You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by rh...@apache.org on 2015/03/19 20:06:41 UTC

svn commit: r1667851 - in /manifoldcf/branches/CONNECTORS-1168/connectors/searchblox: ./ connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/ connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/

Author: rharo
Date: Thu Mar 19 19:06:41 2015
New Revision: 1667851

URL: http://svn.apache.org/r1667851
Log:
CONNECTORS-1168: Added JSON indexing

Modified:
    manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java
    manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java
    manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java
    manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml

Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java Thu Mar 19 19:06:41 2015
@@ -19,6 +19,8 @@ package org.apache.manifoldcf.agents.out
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.UUID;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import javax.ws.rs.client.Client;
 import javax.ws.rs.client.ClientBuilder;
@@ -29,9 +31,12 @@ import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriBuilder;
 
+import org.apache.manifoldcf.agents.output.searchblox.SearchBloxDocument.IndexingFormat;
 import org.apache.manifoldcf.crawler.system.Logging;
 import org.apache.xerces.parsers.DOMParser;
 import org.jboss.resteasy.plugins.providers.StringTextStar;
+import org.json.JSONException;
+import org.json.JSONObject;
 import org.w3c.dom.Document;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
@@ -58,6 +63,8 @@ public class SearchBloxClient {
     private static final String CLEAR_PATH = "clear";
 
     private static final String STATUS_NODE = "statuscode";
+    
+    private static final Pattern status_pattern = Pattern.compile("^status code\\s:\\s([0-9]+)$");
 
     public static enum ResponseCode {
         DOCUMENT_INDEXED(100),
@@ -168,16 +175,21 @@ public class SearchBloxClient {
 
         WebTarget target = client.target(uri.build());
         Builder httpRequest = target.request();
-        httpRequest.accept(MediaType.TEXT_XML_TYPE);
+        if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
+        	httpRequest.accept(MediaType.APPLICATION_JSON_TYPE);
+        }else{
+        	httpRequest.accept(MediaType.APPLICATION_XML_TYPE);
+        }
+        
 
         document.apiKey = this.apikey;
         
         String body = document.toString(iFormat, action);
-        Logging.connectors.debug("XML Document for document: " + document.uid +":" + body);
+        Logging.connectors.debug("Document for document: " + document.uid +":" + body);
         MediaType type = MediaType.TEXT_PLAIN_TYPE;
-//        if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
-//            type = MediaType.APPLICATION_JSON_TYPE;
-//        }
+        if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
+            type = MediaType.APPLICATION_JSON_TYPE;
+        }
 
         
         Entity<String> entity = Entity.entity(body, type);
@@ -191,23 +203,45 @@ public class SearchBloxClient {
             return ResponseCode.SERVER_UNREACHABLE;
         }
         
-        String xmlResponse = response.readEntity(String.class);
-        DOMParser parser = new DOMParser();
-        try {
-            parser.parse(new InputSource(new StringReader(xmlResponse)));
-        } catch (SAXException | IOException e) {
-            Logging.connectors.error("[Response parsing] Dom parsing error", e);
-            throw new SearchBloxException(e);
-        }
-        Document doc = parser.getDocument();
-        NodeList nodeList = doc.getElementsByTagName(STATUS_NODE);
-        if (nodeList == null || nodeList.getLength() == 0) {
-        	String message = "[Response Parsing] Status code not found";
-        	Logging.connectors.error(message);
-            throw new SearchBloxException(message);
-        }
-        String codeStr = nodeList.item(0).getTextContent();
-        int statusCode = Integer.parseInt(codeStr);
-        return ResponseCode.getCodeFromValue(statusCode);
+        String rawResponse = response.readEntity(String.class);
+        if(iFormat == IndexingFormat.XML){
+        	DOMParser parser = new DOMParser();
+        	try {
+        		parser.parse(new InputSource(new StringReader(rawResponse)));
+        	} catch (SAXException | IOException e) {
+        		Logging.connectors.error("[Response parsing] Dom parsing error", e);
+        		throw new SearchBloxException(e);
+        	}
+        	Document doc = parser.getDocument();
+        	NodeList nodeList = doc.getElementsByTagName(STATUS_NODE);
+        	if (nodeList == null || nodeList.getLength() == 0) {
+        		String message = "[Response Parsing] Status code not found";
+        		Logging.connectors.error(message);
+        		throw new SearchBloxException(message);
+        	}
+        	String codeStr = nodeList.item(0).getTextContent();
+        	int statusCode = Integer.parseInt(codeStr);
+        	return ResponseCode.getCodeFromValue(statusCode);
+        }else{
+//        	try {
+//				JSONObject json = new JSONObject(rawResponse);
+//				String codeStr = json.getString(STATUS_NODE);
+        		Matcher matcher = status_pattern.matcher(rawResponse);
+        		String codeStr = null;
+        		if(matcher.find())
+        			codeStr = matcher.group();
+        		if(codeStr == null){
+        			String message = "[Response parsing] Resoponse code parsing error";
+        			Logging.connectors.error(message);
+            		throw new SearchBloxException(message);
+        		}
+        			
+				int statusCode = Integer.parseInt(codeStr);
+	        	return ResponseCode.getCodeFromValue(statusCode);
+//			} catch (JSONException e) {
+//				Logging.connectors.error("[Response parsing] Response JSON parsing error", e);
+//        		throw new SearchBloxException(e);
+//			}
+        }
     }
 }

Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java Thu Mar 19 19:06:41 2015
@@ -16,16 +16,15 @@
  */
 package org.apache.manifoldcf.agents.output.searchblox;
 
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Multimap;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.StringUtils;
-import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
-import org.apache.manifoldcf.crawler.system.Logging;
-import org.jsoup.Jsoup;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
 
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -34,11 +33,21 @@ import javax.xml.transform.TransformerEx
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.text.SimpleDateFormat;
-import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
+import org.apache.manifoldcf.crawler.system.Logging;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
 
 /**
  * "Package" class modeling a SearchBox document as a POJO
@@ -59,8 +68,8 @@ public class SearchBloxDocument {
 	public enum DocumentAction {
 		ADD_UPDATE, DELETE, STATUS, CREATE, CLEAR
 	}
-    static final List<String> xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size",
-            "alpha","contenttype","category","meta","uid");
+	static final List<String> xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size",
+			"alpha","contenttype","category","meta","uid");
 
 	static final String COLNAME_ATTRIBUTE = "colname";
 	static final String APIKEY_ATTRIBUTE = "apikey";
@@ -100,7 +109,7 @@ public class SearchBloxDocument {
 		Date date = rd.getModifiedDate();
 		if(date!=null){
 			data_fields.put(xmlElements.get(7),
-				dateFormat.format(rd.getModifiedDate()));
+					dateFormat.format(rd.getModifiedDate()));
 		}	
 
 		// content
@@ -112,25 +121,25 @@ public class SearchBloxDocument {
 				content = this.buildString(rd.getBinaryStream());
 		} catch (IOException e) {
 			Logging.connectors
-					.error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer",
-                            e);
+			.error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer",
+					e);
 		}
 		data_fields.put(xmlElements.get(5), this.clean(content));
 
 		// Content Type
 		data_fields.put(xmlElements.get(10), rd.getMimeType());
-		
+
 		// Size
 		data_fields.put(xmlElements.get(8), "" + rd.getBinaryLength());
 
 		// Boosting
 		for(String boostId:args.keySet()){
-            if(boostId.endsWith("_boost")){
-                List<String> argBoost = args.get(boostId);
-                if(argBoost!=null && !argBoost.isEmpty())
-                    data_fields.put(boostId,argBoost.get(0));
-            }
-        }
+			if(boostId.endsWith("_boost")){
+				List<String> argBoost = args.get(boostId);
+				if(argBoost!=null && !argBoost.isEmpty())
+					data_fields.put(boostId,argBoost.get(0));
+			}
+		}
 
 		// Metadata
 		Multimap<String, String> metadata = HashMultimap.create();
@@ -159,32 +168,32 @@ public class SearchBloxDocument {
 		}
 
 		// ACLS must be stored as metadata, as Searchblox use that construct to index custom named fields
-        //the approach has been implemented and tested live
+		//the approach has been implemented and tested live
 		Iterator<String> aclTypes = rd.securityTypesIterator();
 		while (aclTypes.hasNext()) {
 			String aclType = aclTypes.next();
 			String[] allow_tokens = rd.getSecurityACL(aclType);
 			for (String token : allow_tokens)
 				metadata.put(aclType+"_allow", token);
-            String[] deny_tokens = rd.getSecurityDenyACL(aclType);
-            for (String token : deny_tokens)
-                metadata.put(aclType+"_deny", token);
+			String[] deny_tokens = rd.getSecurityDenyACL(aclType);
+			for (String token : deny_tokens)
+				metadata.put(aclType+"_deny", token);
 		}
-        data_fields.put(xmlElements.get(12), metadata);
+		data_fields.put(xmlElements.get(12), metadata);
 	}
 
-    /**
-     * Clean a String from html tags or  break lines
-     * @param content
-     * @return
-     */
-    private String clean(String content) {
-        content = content.replaceAll("(\r\n|\n)", " ");
-        String cleanContent= Jsoup.parseBodyFragment(content).text();
-        return cleanContent;
-    }
+	/**
+	 * Clean a String from html tags or  break lines
+	 * @param content
+	 * @return
+	 */
+	private String clean(String content) {
+		content = content.replaceAll("(\r\n|\n)", " ");
+		String cleanContent= Jsoup.parseBodyFragment(content).text();
+		return cleanContent;
+	}
 
-    private String buildString(InputStream binaryStream) throws IOException {
+	private String buildString(InputStream binaryStream) throws IOException {
 		StringWriter writer = new StringWriter();
 		IOUtils.copy(binaryStream, writer, "UTF-8");
 		return writer.toString();
@@ -192,6 +201,81 @@ public class SearchBloxDocument {
 
 	public String toString(IndexingFormat format, DocumentAction action)
 			throws SearchBloxException {
+		if(format == IndexingFormat.XML)
+			return toStringXML(action);
+		else
+			return toStringJSON(action);
+	}
+
+	private String toStringJSON(DocumentAction action) throws SearchBloxException {
+		JSONObject result = new JSONObject();
+		if (apiKey == null)
+			throw new SearchBloxException(
+					"The API Key for accessing SearchBlox Server CAN'T be NULL");
+		try {
+			result.put(APIKEY_ATTRIBUTE, apiKey);
+
+			JSONObject document = new JSONObject();
+			if (colName == null)
+				throw new SearchBloxException(
+						"The Collection Name of the SearchBlox Server CAN'T be NULL");
+			document.put(COLNAME_ATTRIBUTE, colName);
+			document.put(UID_ATTRIBUTE, uid);
+
+			if(action == DocumentAction.ADD_UPDATE){
+				for(String element:xmlElements){
+					if (!element.equals(xmlElements.get(12))) {
+						Collection<Object> values = data_fields.get(element);
+						if (values!=null && values.size()>0) {
+							Object next = values.iterator()
+									.next();
+							String value =(String) next;
+							if (value != null && !value.isEmpty()) {
+								if(element.equals("keywords"))
+									document.put(element, StringUtils.join(values, ','));
+								else
+									document.put(element, value);
+//								Collection<Object> boostElement = data_fields
+//										.get(element + "_boost");
+//								if(boostElement!=null && boostElement.size()>0){
+//									String value_boost = (String) boostElement.iterator()
+//											.next();
+//									eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
+//								}
+								
+							}
+						}
+					}
+				}
+
+				// Metadata
+				Collection<Object> metadataSet = data_fields
+						.get(xmlElements.get(12));
+				JSONObject metaObject = new JSONObject();
+				if(metadataSet!=null && metadataSet.size()>0){
+					Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
+					if (metadata != null && !metadata.isEmpty()) {
+						for (String name : metadata.keySet()){
+							JSONArray nextMetadata = new JSONArray();
+							for (String value : metadata.get(name)) {
+								nextMetadata.put(value);
+							}
+							metaObject.put(name, nextMetadata);
+						}
+					}  
+				}
+				document.put(xmlElements.get(12), metaObject);
+			}
+
+			result.put(xmlElements.get(1), document);
+
+		} catch (JSONException e) {
+			throw new SearchBloxException("Error while building Document JSON object", e);
+		}
+		return result.toString();
+	}
+
+	private String toStringXML(DocumentAction action) throws SearchBloxException{
 		Document doc = null;
 		try {
 			doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
@@ -213,62 +297,61 @@ public class SearchBloxDocument {
 			throw new SearchBloxException(
 					"The Collection Name of the SearchBlox Server CAN'T be NULL");
 		document.setAttribute(COLNAME_ATTRIBUTE, colName);
-        if(action == DocumentAction.DELETE)
-            document.setAttribute(UID_ATTRIBUTE,uid);
+		if(action == DocumentAction.DELETE)
+			document.setAttribute(UID_ATTRIBUTE,uid);
 		root.appendChild(document);
 
 		if (action == DocumentAction.ADD_UPDATE) {
-            // Uid
-            if (uid != null && !uid.isEmpty()) {
-                Element uidElement = doc.createElement(xmlElements.get(13));
-                uidElement.setTextContent(uid);
-                document.appendChild(uidElement);
-            }
-
-            for(String element:xmlElements){
-                if (!element.equals(xmlElements.get(12))) {
-                    Collection<Object> values = data_fields.get(element);
-                    if (values!=null && values.size()>0) {
-                        Object next = values.iterator()
-                                .next();
-                        String value =(String) next;
-                        if (value != null && !value.isEmpty()) {
-                            Element eValue = doc.createElement(element);
-                            if(element.equals("keywords"))
-                                eValue.setTextContent(StringUtils.join(values, ','));
-                            else
-                                eValue.setTextContent(value);
-                            Collection<Object> boostElement = data_fields
-                                    .get(element + "_boost");
-                            if(boostElement!=null && boostElement.size()>0){
-                                String value_boost = (String) boostElement.iterator()
-                                        .next();
-                                eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
-                            }
-                            document.appendChild(eValue);
-                        }
-                    }
-                }
-            }
+			// Uid
+			if (uid != null && !uid.isEmpty()) {
+				Element uidElement = doc.createElement(xmlElements.get(13));
+				uidElement.setTextContent(uid);
+				document.appendChild(uidElement);
+			}
 
-			// Metadata
-            Collection<Object> metadataSet = data_fields
-                    .get(xmlElements.get(12));
-            if(metadataSet!=null && metadataSet.size()>0){
-			Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
-			if (metadata != null && !metadata.isEmpty()) {
-				for (String name : metadata.keySet())
-					for (String value : metadata.get(name)) {
-						Element metaElement = doc.createElement(xmlElements.get(12));
-						metaElement.setAttribute(NAME_ATTRIBUTE, name);
-						metaElement.setTextContent(value);
-						document.appendChild(metaElement);
+			for(String element:xmlElements){
+				if (!element.equals(xmlElements.get(12))) {
+					Collection<Object> values = data_fields.get(element);
+					if (values!=null && values.size()>0) {
+						Object next = values.iterator()
+								.next();
+						String value =(String) next;
+						if (value != null && !value.isEmpty()) {
+							Element eValue = doc.createElement(element);
+							if(element.equals("keywords"))
+								eValue.setTextContent(StringUtils.join(values, ','));
+							else
+								eValue.setTextContent(value);
+							Collection<Object> boostElement = data_fields
+									.get(element + "_boost");
+							if(boostElement!=null && boostElement.size()>0){
+								String value_boost = (String) boostElement.iterator()
+										.next();
+								eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
+							}
+							document.appendChild(eValue);
+						}
 					}
-			}  }
+				}
+			}
+
+			// Metadata
+			Collection<Object> metadataSet = data_fields
+					.get(xmlElements.get(12));
+			if(metadataSet!=null && metadataSet.size()>0){
+				Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
+				if (metadata != null && !metadata.isEmpty()) {
+					for (String name : metadata.keySet())
+						for (String value : metadata.get(name)) {
+							Element metaElement = doc.createElement(xmlElements.get(12));
+							metaElement.setAttribute(NAME_ATTRIBUTE, name);
+							metaElement.setTextContent(value);
+							document.appendChild(metaElement);
+						}
+				}  }
 		}
 
 		return getStringFromDocument(doc);
-
 	}
 
 	/**
@@ -283,7 +366,7 @@ public class SearchBloxDocument {
 			StreamResult result = new StreamResult(writer);
 			TransformerFactory tf = TransformerFactory.newInstance();
 			Transformer transformer = tf.newTransformer();
-//			transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
+			//			transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
 			transformer.transform(domSource, result);
 			return writer.toString();
 		} catch (TransformerException ex) {

Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java Thu Mar 19 19:06:41 2015
@@ -1,19 +1,3 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
 package org.apache.manifoldcf.agents.output.searchblox.tests;
 
 import com.google.common.collect.Lists;

Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml Thu Mar 19 19:06:41 2015
@@ -278,7 +278,7 @@
 		<dependency>
 			<groupId>org.jboss.resteasy</groupId>
 			<artifactId>resteasy-client</artifactId>
-			<version>3.0.8.Final</version>
+			<version>3.0.9.Final</version>
 			<exclusions>
 				<exclusion>
 					<groupId>org.slf4j</groupId>