You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by rh...@apache.org on 2015/03/19 20:06:41 UTC
svn commit: r1667851 - in
/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox: ./
connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/
connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/
Author: rharo
Date: Thu Mar 19 19:06:41 2015
New Revision: 1667851
URL: http://svn.apache.org/r1667851
Log:
CONNECTORS-1168: Added JSON indexing
Modified:
manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java
manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java
manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java
manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml
Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxClient.java Thu Mar 19 19:06:41 2015
@@ -19,6 +19,8 @@ package org.apache.manifoldcf.agents.out
import java.io.IOException;
import java.io.StringReader;
import java.util.UUID;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
@@ -29,9 +31,12 @@ import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriBuilder;
+import org.apache.manifoldcf.agents.output.searchblox.SearchBloxDocument.IndexingFormat;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.xerces.parsers.DOMParser;
import org.jboss.resteasy.plugins.providers.StringTextStar;
+import org.json.JSONException;
+import org.json.JSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
@@ -58,6 +63,8 @@ public class SearchBloxClient {
private static final String CLEAR_PATH = "clear";
private static final String STATUS_NODE = "statuscode";
+
+ private static final Pattern status_pattern = Pattern.compile("^status code\\s:\\s([0-9]+)$");
public static enum ResponseCode {
DOCUMENT_INDEXED(100),
@@ -168,16 +175,21 @@ public class SearchBloxClient {
WebTarget target = client.target(uri.build());
Builder httpRequest = target.request();
- httpRequest.accept(MediaType.TEXT_XML_TYPE);
+ if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
+ httpRequest.accept(MediaType.APPLICATION_JSON_TYPE);
+ }else{
+ httpRequest.accept(MediaType.APPLICATION_XML_TYPE);
+ }
+
document.apiKey = this.apikey;
String body = document.toString(iFormat, action);
- Logging.connectors.debug("XML Document for document: " + document.uid +":" + body);
+ Logging.connectors.debug("Document for document: " + document.uid +":" + body);
MediaType type = MediaType.TEXT_PLAIN_TYPE;
-// if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
-// type = MediaType.APPLICATION_JSON_TYPE;
-// }
+ if (iFormat == SearchBloxDocument.IndexingFormat.JSON) {
+ type = MediaType.APPLICATION_JSON_TYPE;
+ }
Entity<String> entity = Entity.entity(body, type);
@@ -191,23 +203,45 @@ public class SearchBloxClient {
return ResponseCode.SERVER_UNREACHABLE;
}
- String xmlResponse = response.readEntity(String.class);
- DOMParser parser = new DOMParser();
- try {
- parser.parse(new InputSource(new StringReader(xmlResponse)));
- } catch (SAXException | IOException e) {
- Logging.connectors.error("[Response parsing] Dom parsing error", e);
- throw new SearchBloxException(e);
- }
- Document doc = parser.getDocument();
- NodeList nodeList = doc.getElementsByTagName(STATUS_NODE);
- if (nodeList == null || nodeList.getLength() == 0) {
- String message = "[Response Parsing] Status code not found";
- Logging.connectors.error(message);
- throw new SearchBloxException(message);
- }
- String codeStr = nodeList.item(0).getTextContent();
- int statusCode = Integer.parseInt(codeStr);
- return ResponseCode.getCodeFromValue(statusCode);
+ String rawResponse = response.readEntity(String.class);
+ if(iFormat == IndexingFormat.XML){
+ DOMParser parser = new DOMParser();
+ try {
+ parser.parse(new InputSource(new StringReader(rawResponse)));
+ } catch (SAXException | IOException e) {
+ Logging.connectors.error("[Response parsing] Dom parsing error", e);
+ throw new SearchBloxException(e);
+ }
+ Document doc = parser.getDocument();
+ NodeList nodeList = doc.getElementsByTagName(STATUS_NODE);
+ if (nodeList == null || nodeList.getLength() == 0) {
+ String message = "[Response Parsing] Status code not found";
+ Logging.connectors.error(message);
+ throw new SearchBloxException(message);
+ }
+ String codeStr = nodeList.item(0).getTextContent();
+ int statusCode = Integer.parseInt(codeStr);
+ return ResponseCode.getCodeFromValue(statusCode);
+ }else{
+// try {
+// JSONObject json = new JSONObject(rawResponse);
+// String codeStr = json.getString(STATUS_NODE);
+ Matcher matcher = status_pattern.matcher(rawResponse);
+ String codeStr = null;
+ if(matcher.find())
+ codeStr = matcher.group();
+ if(codeStr == null){
+ String message = "[Response parsing] Resoponse code parsing error";
+ Logging.connectors.error(message);
+ throw new SearchBloxException(message);
+ }
+
+ int statusCode = Integer.parseInt(codeStr);
+ return ResponseCode.getCodeFromValue(statusCode);
+// } catch (JSONException e) {
+// Logging.connectors.error("[Response parsing] Response JSON parsing error", e);
+// throw new SearchBloxException(e);
+// }
+ }
}
}
Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java Thu Mar 19 19:06:41 2015
@@ -16,16 +16,15 @@
*/
package org.apache.manifoldcf.agents.output.searchblox;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Multimap;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.StringUtils;
-import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
-import org.apache.manifoldcf.crawler.system.Logging;
-import org.jsoup.Jsoup;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
@@ -34,11 +33,21 @@ import javax.xml.transform.TransformerEx
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.text.SimpleDateFormat;
-import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
+import org.apache.manifoldcf.crawler.system.Logging;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
/**
* "Package" class modeling a SearchBox document as a POJO
@@ -59,8 +68,8 @@ public class SearchBloxDocument {
public enum DocumentAction {
ADD_UPDATE, DELETE, STATUS, CREATE, CLEAR
}
- static final List<String> xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size",
- "alpha","contenttype","category","meta","uid");
+ static final List<String> xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size",
+ "alpha","contenttype","category","meta","uid");
static final String COLNAME_ATTRIBUTE = "colname";
static final String APIKEY_ATTRIBUTE = "apikey";
@@ -100,7 +109,7 @@ public class SearchBloxDocument {
Date date = rd.getModifiedDate();
if(date!=null){
data_fields.put(xmlElements.get(7),
- dateFormat.format(rd.getModifiedDate()));
+ dateFormat.format(rd.getModifiedDate()));
}
// content
@@ -112,25 +121,25 @@ public class SearchBloxDocument {
content = this.buildString(rd.getBinaryStream());
} catch (IOException e) {
Logging.connectors
- .error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer",
- e);
+ .error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer",
+ e);
}
data_fields.put(xmlElements.get(5), this.clean(content));
// Content Type
data_fields.put(xmlElements.get(10), rd.getMimeType());
-
+
// Size
data_fields.put(xmlElements.get(8), "" + rd.getBinaryLength());
// Boosting
for(String boostId:args.keySet()){
- if(boostId.endsWith("_boost")){
- List<String> argBoost = args.get(boostId);
- if(argBoost!=null && !argBoost.isEmpty())
- data_fields.put(boostId,argBoost.get(0));
- }
- }
+ if(boostId.endsWith("_boost")){
+ List<String> argBoost = args.get(boostId);
+ if(argBoost!=null && !argBoost.isEmpty())
+ data_fields.put(boostId,argBoost.get(0));
+ }
+ }
// Metadata
Multimap<String, String> metadata = HashMultimap.create();
@@ -159,32 +168,32 @@ public class SearchBloxDocument {
}
// ACLS must be stored as metadata, as Searchblox use that construct to index custom named fields
- //the approach has been implemented and tested live
+ //the approach has been implemented and tested live
Iterator<String> aclTypes = rd.securityTypesIterator();
while (aclTypes.hasNext()) {
String aclType = aclTypes.next();
String[] allow_tokens = rd.getSecurityACL(aclType);
for (String token : allow_tokens)
metadata.put(aclType+"_allow", token);
- String[] deny_tokens = rd.getSecurityDenyACL(aclType);
- for (String token : deny_tokens)
- metadata.put(aclType+"_deny", token);
+ String[] deny_tokens = rd.getSecurityDenyACL(aclType);
+ for (String token : deny_tokens)
+ metadata.put(aclType+"_deny", token);
}
- data_fields.put(xmlElements.get(12), metadata);
+ data_fields.put(xmlElements.get(12), metadata);
}
- /**
- * Clean a String from html tags or break lines
- * @param content
- * @return
- */
- private String clean(String content) {
- content = content.replaceAll("(\r\n|\n)", " ");
- String cleanContent= Jsoup.parseBodyFragment(content).text();
- return cleanContent;
- }
+ /**
+ * Clean a String from html tags or break lines
+ * @param content
+ * @return
+ */
+ private String clean(String content) {
+ content = content.replaceAll("(\r\n|\n)", " ");
+ String cleanContent= Jsoup.parseBodyFragment(content).text();
+ return cleanContent;
+ }
- private String buildString(InputStream binaryStream) throws IOException {
+ private String buildString(InputStream binaryStream) throws IOException {
StringWriter writer = new StringWriter();
IOUtils.copy(binaryStream, writer, "UTF-8");
return writer.toString();
@@ -192,6 +201,81 @@ public class SearchBloxDocument {
public String toString(IndexingFormat format, DocumentAction action)
throws SearchBloxException {
+ if(format == IndexingFormat.XML)
+ return toStringXML(action);
+ else
+ return toStringJSON(action);
+ }
+
+ private String toStringJSON(DocumentAction action) throws SearchBloxException {
+ JSONObject result = new JSONObject();
+ if (apiKey == null)
+ throw new SearchBloxException(
+ "The API Key for accessing SearchBlox Server CAN'T be NULL");
+ try {
+ result.put(APIKEY_ATTRIBUTE, apiKey);
+
+ JSONObject document = new JSONObject();
+ if (colName == null)
+ throw new SearchBloxException(
+ "The Collection Name of the SearchBlox Server CAN'T be NULL");
+ document.put(COLNAME_ATTRIBUTE, colName);
+ document.put(UID_ATTRIBUTE, uid);
+
+ if(action == DocumentAction.ADD_UPDATE){
+ for(String element:xmlElements){
+ if (!element.equals(xmlElements.get(12))) {
+ Collection<Object> values = data_fields.get(element);
+ if (values!=null && values.size()>0) {
+ Object next = values.iterator()
+ .next();
+ String value =(String) next;
+ if (value != null && !value.isEmpty()) {
+ if(element.equals("keywords"))
+ document.put(element, StringUtils.join(values, ','));
+ else
+ document.put(element, value);
+// Collection<Object> boostElement = data_fields
+// .get(element + "_boost");
+// if(boostElement!=null && boostElement.size()>0){
+// String value_boost = (String) boostElement.iterator()
+// .next();
+// eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
+// }
+
+ }
+ }
+ }
+ }
+
+ // Metadata
+ Collection<Object> metadataSet = data_fields
+ .get(xmlElements.get(12));
+ JSONObject metaObject = new JSONObject();
+ if(metadataSet!=null && metadataSet.size()>0){
+ Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
+ if (metadata != null && !metadata.isEmpty()) {
+ for (String name : metadata.keySet()){
+ JSONArray nextMetadata = new JSONArray();
+ for (String value : metadata.get(name)) {
+ nextMetadata.put(value);
+ }
+ metaObject.put(name, nextMetadata);
+ }
+ }
+ }
+ document.put(xmlElements.get(12), metaObject);
+ }
+
+ result.put(xmlElements.get(1), document);
+
+ } catch (JSONException e) {
+ throw new SearchBloxException("Error while building Document JSON object", e);
+ }
+ return result.toString();
+ }
+
+ private String toStringXML(DocumentAction action) throws SearchBloxException{
Document doc = null;
try {
doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
@@ -213,62 +297,61 @@ public class SearchBloxDocument {
throw new SearchBloxException(
"The Collection Name of the SearchBlox Server CAN'T be NULL");
document.setAttribute(COLNAME_ATTRIBUTE, colName);
- if(action == DocumentAction.DELETE)
- document.setAttribute(UID_ATTRIBUTE,uid);
+ if(action == DocumentAction.DELETE)
+ document.setAttribute(UID_ATTRIBUTE,uid);
root.appendChild(document);
if (action == DocumentAction.ADD_UPDATE) {
- // Uid
- if (uid != null && !uid.isEmpty()) {
- Element uidElement = doc.createElement(xmlElements.get(13));
- uidElement.setTextContent(uid);
- document.appendChild(uidElement);
- }
-
- for(String element:xmlElements){
- if (!element.equals(xmlElements.get(12))) {
- Collection<Object> values = data_fields.get(element);
- if (values!=null && values.size()>0) {
- Object next = values.iterator()
- .next();
- String value =(String) next;
- if (value != null && !value.isEmpty()) {
- Element eValue = doc.createElement(element);
- if(element.equals("keywords"))
- eValue.setTextContent(StringUtils.join(values, ','));
- else
- eValue.setTextContent(value);
- Collection<Object> boostElement = data_fields
- .get(element + "_boost");
- if(boostElement!=null && boostElement.size()>0){
- String value_boost = (String) boostElement.iterator()
- .next();
- eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
- }
- document.appendChild(eValue);
- }
- }
- }
- }
+ // Uid
+ if (uid != null && !uid.isEmpty()) {
+ Element uidElement = doc.createElement(xmlElements.get(13));
+ uidElement.setTextContent(uid);
+ document.appendChild(uidElement);
+ }
- // Metadata
- Collection<Object> metadataSet = data_fields
- .get(xmlElements.get(12));
- if(metadataSet!=null && metadataSet.size()>0){
- Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
- if (metadata != null && !metadata.isEmpty()) {
- for (String name : metadata.keySet())
- for (String value : metadata.get(name)) {
- Element metaElement = doc.createElement(xmlElements.get(12));
- metaElement.setAttribute(NAME_ATTRIBUTE, name);
- metaElement.setTextContent(value);
- document.appendChild(metaElement);
+ for(String element:xmlElements){
+ if (!element.equals(xmlElements.get(12))) {
+ Collection<Object> values = data_fields.get(element);
+ if (values!=null && values.size()>0) {
+ Object next = values.iterator()
+ .next();
+ String value =(String) next;
+ if (value != null && !value.isEmpty()) {
+ Element eValue = doc.createElement(element);
+ if(element.equals("keywords"))
+ eValue.setTextContent(StringUtils.join(values, ','));
+ else
+ eValue.setTextContent(value);
+ Collection<Object> boostElement = data_fields
+ .get(element + "_boost");
+ if(boostElement!=null && boostElement.size()>0){
+ String value_boost = (String) boostElement.iterator()
+ .next();
+ eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost);
+ }
+ document.appendChild(eValue);
+ }
}
- } }
+ }
+ }
+
+ // Metadata
+ Collection<Object> metadataSet = data_fields
+ .get(xmlElements.get(12));
+ if(metadataSet!=null && metadataSet.size()>0){
+ Multimap<String, String> metadata = (Multimap<String, String>) metadataSet.iterator().next();
+ if (metadata != null && !metadata.isEmpty()) {
+ for (String name : metadata.keySet())
+ for (String value : metadata.get(name)) {
+ Element metaElement = doc.createElement(xmlElements.get(12));
+ metaElement.setAttribute(NAME_ATTRIBUTE, name);
+ metaElement.setTextContent(value);
+ document.appendChild(metaElement);
+ }
+ } }
}
return getStringFromDocument(doc);
-
}
/**
@@ -283,7 +366,7 @@ public class SearchBloxDocument {
StreamResult result = new StreamResult(writer);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
-// transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
+ // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
transformer.transform(domSource, result);
return writer.toString();
} catch (TransformerException ex) {
Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/connector/src/test/java/org/apache/manifoldcf/agents/output/searchblox/tests/SearchBloxDocumentTest.java Thu Mar 19 19:06:41 2015
@@ -1,19 +1,3 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
package org.apache.manifoldcf.agents.output.searchblox.tests;
import com.google.common.collect.Lists;
Modified: manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml?rev=1667851&r1=1667850&r2=1667851&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-1168/connectors/searchblox/pom.xml Thu Mar 19 19:06:41 2015
@@ -278,7 +278,7 @@
<dependency>
<groupId>org.jboss.resteasy</groupId>
<artifactId>resteasy-client</artifactId>
- <version>3.0.8.Final</version>
+ <version>3.0.9.Final</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>