You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2018/03/16 16:31:27 UTC

svn commit: r1827009 - in /manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector: ./ connector/ connector/src/ connector/src/main/ connector/src/main/java/ connector/src/main/java/com/ connector/src/main/java/com/francelabs/ ...

Author: kwright
Date: Fri Mar 16 16:31:26 2018
New Revision: 1827009

URL: http://svn.apache.org/viewvc?rev=1827009&view=rev
Log:
Commit initial contribution (with path changes)

Added:
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/datafari/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/datafari/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/datafari/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
    manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml Fri Mar 16 16:31:26 2018
@@ -0,0 +1,59 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project name="html" default="all">
+
+    <!-- Expose the process environment as env.* properties. -->
+    <property environment="env"/>
+    <!-- When MCFDISTPATH is set in the environment, use it as the distribution directory. -->
+    <condition property="mcf-dist" value="${env.MCFDISTPATH}">
+        <isset property="env.MCFDISTPATH"/>
+    </condition>
+    <!-- Otherwise fall back to ../../dist; "location" makes Ant resolve it to an absolute path. -->
+    <property name="abs-dist" location="../../dist"/>
+    <condition property="mcf-dist" value="${abs-dist}">
+        <not>
+            <isset property="env.MCFDISTPATH"/>
+        </not>
+    </condition>
+
+    <!-- Import connector-build.xml from the distribution directory; presumably it defines the
+         mcf-connector-build.* targets and references used below (confirm against that file). -->
+    <import file="${mcf-dist}/connector-build.xml"/>
+    
+    <!-- Compile classpath: the imported build's connector classpath plus the jsoup jar(s) from ../../lib. -->
+    <path id="connector-classpath">
+        <path refid="mcf-connector-build.connector-classpath"/>
+        <fileset dir="../../lib">
+            <include name="jsoup*.jar"/>
+            
+        </fileset>
+    </path>
+
+    <!-- Runs after mcf-connector-build.lib and precompile-check, and only when the canBuild
+         property is set: copies the jsoup jar(s) from ../../lib into dist/lib. -->
+    <target name="lib" depends="mcf-connector-build.lib,precompile-check" if="canBuild">
+        <mkdir dir="dist/lib"/>
+        <copy todir="dist/lib">
+            <fileset dir="../../lib">
+                 <include name="jsoup*.jar"/>
+            </fileset>
+        </copy>
+    </target>
+
+    <!-- Runs after mcf-connector-build.deliver-connector and registers this connector as a
+         transformation connector through the general-add-transformation-connector target.
+         The class name follows the source directory of HtmlExtractor.java
+         (org/apache/manifoldcf/agents/transformation/htmlextractor); the previous value used a
+         "transformers" package that does not exist in this connector.
+         NOTE(review): confirm that the package declaration inside HtmlExtractor.java agrees with
+         this value before relying on it. -->
+    <target name="deliver-connector" depends="mcf-connector-build.deliver-connector">
+        <antcall target="general-add-transformation-connector">
+            <param name="connector-label" value="Html extractor"/>
+            <param name="connector-class" value="org.apache.manifoldcf.agents.transformation.htmlextractor.HtmlExtractor"/>
+        </antcall>
+    </target>
+
+</project>
+
+

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,753 @@
+package com.francelabs.datafari.htmlextractor;
+
+/* $Id$ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+
+import com.francelabs.datafari.htmlextractor.exception.RegexException;
+
+import org.apache.manifoldcf.agents.interfaces.*;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
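+/** Transformation connector that runs HTML (and XHTML) documents through Jsoup: the document
+ * content is replaced by the text extracted according to the configured include/exclude filters,
+ * and the extracted metadata is added to the document as "jsoup_"-prefixed fields.
+ */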
+public class HtmlExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
+{
+
+	/** Revision-control identification string. */
+	public static final String _rcsid = "@(#)$Id$";
+
+	/** Name of the single activity this connector records. */
+	protected static final String ACTIVITY_PROCESS = "process";
+
+	/** All activity names reported by {@link #getActivitiesList()}. */
+	protected static final String[] activitiesList = {ACTIVITY_PROCESS};
+
+	/** Resource holding the javascript emitted in the head of the connection configuration page. */
+	private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
+
+	/** Resource rendered when viewing the connection configuration. */
+	private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
+	/** Resource holding the javascript emitted in the head of the job specification page. */
+	private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
+	/** Resource name for the read-only view of the job specification. */
+	private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
+	/** Resource rendered as the body of the HTML extractor tab when editing a job. */
+	private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";
+
+
+	/** Size threshold (64K) below which content is meant to be held in memory rather than on disk. */
+	protected static final long inMemoryMaximumFile = 65536;
+
+	/** Report the names of the activities this connector may record.
+	 * No connection has to be established before this method is called.
+	 *@return the array of activity names.
+	 */
+	@Override
+	public String[] getActivitiesList()
+	{
+		final String[] supportedActivities = activitiesList;
+		return supportedActivities;
+	}
+
+	/** Extract the text and metadata of an HTML document and send the result to the next pipeline stage.
+	 * A document whose mime type is unknown, or is neither text/html nor application/xhtml+xml, is
+	 * forwarded unchanged. For an HTML document the binary content is replaced by the "extractedDoc"
+	 * entry returned by JsoupProcessing.extractTextAndMetadataHtmlDocument() (encoded as UTF-8), and
+	 * every other entry of that result is added to the document as a field named "jsoup_" + key.
+	 * The outcome of the HTML processing is recorded as a "process" activity.
+	 *@param documentURI is the URI of the document, passed through to the next pipeline stage.
+	 *@param pipelineDescription carries the specification from which the include and exclude filters are read.
+	 *@param document is the document to transform; it is modified in place.
+	 *@param authorityNameString is the name of the authority for the document's access tokens; not used here.
+	 *@param activities is the handle used to record the activity and to send the document onward.
+	 *@return the document status returned by activities.sendDocument().
+	 *@throws ManifoldCFException if processing or sending the document fails with that exception.
+	 *@throws ServiceInterruption if processing or sending the document is interrupted by the service.
+	 *@throws IOException if there is a stream error while reading or sending the document.
+	 */
+	@Override
+	public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+			throws ManifoldCFException, ServiceInterruption, IOException
+	{
+		final long startTime = System.currentTimeMillis();
+		String resultCode = "OK";
+		String description = null;
+		Long length = null;
+
+		final SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+
+		Logging.root.info("Processing by HTML Extractor");
+
+		// A document is processed when it is HTML *or* XHTML; a missing mime type counts as "not HTML".
+		final String mimeType = document.getMimeType();
+		final boolean isHtml = mimeType != null
+				&& (mimeType.startsWith("text/html") || mimeType.startsWith("application/xhtml+xml"));
+		if (!isHtml)
+		{
+			Logging.root.warn("no processing, mime type not html");
+			return activities.sendDocument(documentURI,document);
+		}
+
+		try
+		{
+			Logging.root.info("Document recognized as HTML - processing");
+			length = Long.valueOf(document.getBinaryLength());
+
+			//TODO
+			/* Add an option to keep HTML markup of the extracted text or not -
+			 * in case for example of processing by Tika after this transformation connector
+			 */
+			// Only the first include filter is used; all exclude filters are applied.
+			final Map<String,String> metadataExtracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters);
+
+			// Replace the document content with the extracted text, using the exact encoded length.
+			final byte[] extractedBytes = metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8);
+			document.setBinary(new ByteArrayInputStream(extractedBytes), extractedBytes.length);
+
+			// Everything except the extracted text itself becomes a "jsoup_"-prefixed field.
+			for (final Map.Entry<String,String> entry : metadataExtracted.entrySet())
+			{
+				if (!"extractedDoc".equals(entry.getKey()))
+					document.addField("jsoup_"+entry.getKey(), entry.getValue());
+			}
+
+			return activities.sendDocument(documentURI,document);
+		}
+		catch (ServiceInterruption e)
+		{
+			resultCode = "SERVICEINTERRUPTION";
+			description = e.getMessage();
+			throw e;
+		}
+		catch (ManifoldCFException e)
+		{
+			resultCode = "EXCEPTION";
+			description = e.getMessage();
+			throw e;
+		}
+		catch (IOException e)
+		{
+			resultCode = "IOEXCEPTION";
+			description = e.getMessage();
+			throw e;
+		}
+		catch (Exception e)
+		{
+			// Any other failure is recorded in the activity and the document is still sent below.
+			// NOTE(review): the original binary stream may already have been read at this point - confirm
+			// that forwarding the document in this state is intended.
+			resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+			description = e.getMessage();
+		}
+		finally
+		{
+			activities.recordActivity(Long.valueOf(startTime), ACTIVITY_PROCESS, length, documentURI,
+					resultCode, description);
+		}
+
+		return activities.sendDocument(documentURI,document);
+	}
+
+
+	/** Abstraction over a place where transformed content is written and later read back. */
+	protected static interface DestinationStorage
+	{
+		/** Obtain the stream to write content to.
+		 * The caller is expected to close this stream explicitly once writing is finished.
+		 */
+		OutputStream getOutputStream()
+				throws ManifoldCFException;
+
+		/** Report the length of the content held by this storage.
+		 */
+		long getBinaryLength()
+				throws ManifoldCFException;
+
+		/** Obtain a stream to read the stored content from.
+		 * The caller is expected to close this stream explicitly once reading is finished.
+		 */
+		InputStream getInputStream()
+				throws ManifoldCFException;
+
+		/** Release everything held by this storage.
+		 * Call this once the data is no longer needed.
+		 */
+		void close()
+				throws ManifoldCFException;
+	}
+
+	/** Destination storage backed by a temporary file.
+	 * I/O failures are reported as ManifoldCFException instead of being silently ignored.
+	 */
+	protected static class FileDestinationStorage implements DestinationStorage
+	{
+		protected final File outputFile;
+		protected final OutputStream outputStream;
+
+		/** Create the temporary file and open an output stream on it.
+		 *@throws ManifoldCFException if the temporary file cannot be created or opened.
+		 */
+		public FileDestinationStorage()
+				throws ManifoldCFException
+		{
+			final File tempFile;
+			try
+			{
+				tempFile = File.createTempFile("mcftika","tmp");
+			}
+			catch (IOException e)
+			{
+				throw wrapIOException(e);
+			}
+
+			final OutputStream tempStream;
+			try
+			{
+				tempStream = new FileOutputStream(tempFile);
+			}
+			catch (IOException e)
+			{
+				// Do not leave the temporary file behind when it cannot be opened.
+				tempFile.delete();
+				throw wrapIOException(e);
+			}
+
+			this.outputFile = tempFile;
+			this.outputStream = tempStream;
+		}
+
+		/** Get the output stream to write to.  Caller should explicitly close this stream when done writing.
+		 */
+		@Override
+		public OutputStream getOutputStream()
+				throws ManifoldCFException
+		{
+			return outputStream;
+		}
+
+		/** Get new binary length, i.e. the current length of the temporary file.
+		 */
+		@Override
+		public long getBinaryLength()
+				throws ManifoldCFException
+		{
+			return outputFile.length();
+		}
+
+		/** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
+		 *@throws ManifoldCFException if the temporary file cannot be opened for reading.
+		 */
+		@Override
+		public InputStream getInputStream()
+				throws ManifoldCFException
+		{
+			try
+			{
+				return new FileInputStream(outputFile);
+			}
+			catch (IOException e)
+			{
+				throw wrapIOException(e);
+			}
+		}
+
+		/** Convert an IOException on the local temporary file into a ManifoldCFException,
+		 * flagging interrupted I/O with the INTERRUPTED error code.
+		 */
+		private static ManifoldCFException wrapIOException(IOException e)
+		{
+			if (e instanceof InterruptedIOException)
+				return new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+			return new ManifoldCFException(e.getMessage(),e);
+		}
+
+		/** Close the object and clean up everything.
+		 * This should be called when the data is no longer needed; it deletes the temporary file.
+		 */
+		@Override
+		public void close()
+				throws ManifoldCFException
+		{
+			outputFile.delete();
+		}
+
+	}
+
+	/** Destination storage that keeps the written content in a byte array in memory. */
+	protected static class MemoryDestinationStorage implements DestinationStorage
+	{
+		protected final ByteArrayOutputStream outputStream;
+
+		/** Create the in-memory buffer.
+		 *@param sizeHint is the initial capacity of the buffer.
+		 */
+		public MemoryDestinationStorage(int sizeHint)
+		{
+			this.outputStream = new ByteArrayOutputStream(sizeHint);
+		}
+
+		/** Obtain the in-memory stream to write content to.
+		 */
+		@Override
+		public OutputStream getOutputStream()
+				throws ManifoldCFException
+		{
+			return this.outputStream;
+		}
+
+		/** Report how many bytes have been written so far.
+		 */
+		@Override
+		public long getBinaryLength()
+				throws ManifoldCFException
+		{
+			final int writtenSoFar = this.outputStream.size();
+			return writtenSoFar;
+		}
+
+		/** Obtain a stream over a copy of the bytes written so far.
+		 * The caller is expected to close this stream explicitly once reading is finished.
+		 */
+		@Override
+		public InputStream getInputStream()
+				throws ManifoldCFException
+		{
+			final byte[] snapshot = this.outputStream.toByteArray();
+			return new ByteArrayInputStream(snapshot);
+		}
+
+		/** Nothing is released here; the method exists to satisfy the storage contract.
+		 */
+		@Override
+		public void close()
+				throws ManifoldCFException
+		{
+		}
+
+		/** Translate an IOException on local storage into a ManifoldCFException.
+		 * Interrupted I/O is reported with the INTERRUPTED error code. This method always throws.
+		 */
+		protected static int handleIOException(IOException e)
+				throws ManifoldCFException
+		{
+			final boolean wasInterrupted = e instanceof InterruptedIOException;
+			if (!wasInterrupted)
+				throw new ManifoldCFException(e.getMessage(),e);
+			throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+		}
+
+	}
+	/**
+	 * Find the first regular expression of a list that matches somewhere in the
+	 * provided string.
+	 *
+	 * @param regexList
+	 *          the regular expressions to try, in order
+	 * @param str
+	 *          the string to test
+	 * @return the first regex found in the string, or null if none is found
+	 * @throws RegexException
+	 *           if one of the tried expressions is not a valid regular expression
+	 */
+	private String matchingRegex(final List<String> regexList, final String str) throws RegexException {
+		final Iterator<String> candidates = regexList.iterator();
+		while (candidates.hasNext()) {
+			final String candidate = candidates.next();
+			final boolean found;
+			try {
+				found = Pattern.compile(candidate).matcher(str).find();
+			} catch (final PatternSyntaxException e) {
+				throw new RegexException(candidate, "Invalid regular expression");
+			}
+			if (found) {
+				return candidate;
+			}
+		}
+		return null;
+	}
+
+
+
+
+
+
+
+	/**
+	 * Emit the head section of the connection configuration page. This
+	 * implementation adds no tab; it only renders the editConfiguration.js
+	 * resource, without any template context.
+	 *
+	 * @param threadContext
+	 *          is the local thread context.
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the locale used to render the resource.
+	 * @param parameters
+	 *          are the current configuration parameters of the connection being
+	 *          configured.
+	 * @param tabsArray
+	 *          is the list of tab names; left untouched here.
+	 */
+	@Override
+	public void outputConfigurationHeader(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+			final ConfigParams parameters, final List<String> tabsArray) throws ManifoldCFException, IOException {
+		final String headerResource = EDIT_CONFIGURATION_JS;
+		Messages.outputResourceWithVelocity(out, locale, headerResource, null);
+	}
+
+	/**
+	 * Output the configuration body section. This method is called in the body
+	 * section of the connector's configuration page, inside the "editconnection"
+	 * form. This connector emits nothing here, so the method intentionally
+	 * produces no output.
+	 *
+	 * @param threadContext
+	 *          is the local thread context.
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the locale of the page.
+	 * @param parameters
+	 *          are the configuration parameters, as they currently exist, for
+	 *          this connection being configured.
+	 * @param tabName
+	 *          is the current tab name.
+	 */
+	@Override
+	public void outputConfigurationBody(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+			final ConfigParams parameters, final String tabName) throws ManifoldCFException, IOException {
+		// Nothing to render: the template context that used to be built here was never sent to any resource.
+	}
+
+	/**
+	 * Handle a post of the "editconnection" form. This implementation reads
+	 * nothing from the post and never reports an error.
+	 *
+	 * @param threadContext
+	 *          is the local thread context.
+	 * @param variableContext
+	 *          is the set of variables available from the post, including binary
+	 *          file post information.
+	 * @param locale
+	 *          is the locale of the page.
+	 * @param parameters
+	 *          are the current configuration parameters of the connection being
+	 *          configured; left untouched here.
+	 * @return always null, meaning that nothing prevents saving the connection.
+	 */
+	@Override
+	public String processConfigurationPost(final IThreadContext threadContext, final IPostParameters variableContext,
+			final Locale locale, final ConfigParams parameters) throws ManifoldCFException {
+		final String noError = null;
+		return noError;
+	}
+
+	/**
+	 * Render the read-only view of the connection configuration by sending the
+	 * viewConfiguration.html resource with an empty template context.
+	 *
+	 * @param threadContext
+	 *          is the local thread context.
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the locale used to render the resource.
+	 * @param parameters
+	 *          are the current configuration parameters of the connection being
+	 *          viewed.
+	 */
+	@Override
+	public void viewConfiguration(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+			final ConfigParams parameters) throws ManifoldCFException, IOException {
+		final Map<String, Object> emptyContext = new HashMap<String, Object>();
+		Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, emptyContext);
+	}
+
+	/**
+	 * Collect the include and exclude filter expressions of a specification and
+	 * publish them in the template parameter map under the keys "INCLUDEFILTERS"
+	 * and "EXCLUDEFILTERS". Nodes of any other type, and filter nodes without a
+	 * regex attribute, are ignored.
+	 *
+	 * @param paramMap
+	 *          is the map receiving the two lists.
+	 * @param os
+	 *          is the specification whose child nodes are scanned.
+	 */
+	protected static void fillInHtmlExtractorSpecification(final Map<String, Object> paramMap, final Specification os) {
+		final List<String> includes = new ArrayList<>();
+		final List<String> excludes = new ArrayList<>();
+
+		int index = 0;
+		while (index < os.getChildCount()) {
+			final SpecificationNode node = os.getChild(index);
+			index++;
+
+			// Pick the list this node contributes to, if any.
+			final List<String> target;
+			if (node.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+				target = includes;
+			} else if (node.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+				target = excludes;
+			} else {
+				continue;
+			}
+
+			final String regex = node.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+			if (regex != null) {
+				target.add(regex);
+			}
+		}
+
+		paramMap.put("INCLUDEFILTERS", includes);
+		paramMap.put("EXCLUDEFILTERS", excludes);
+	}
+
+	/**
+	 * Emit the head section of a job page for this pipeline connection: register
+	 * the HTML extractor tab and render the editSpecification.js resource with
+	 * the sequence number and the current filters as template parameters.
+	 *
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the locale used for the tab name and the resource.
+	 * @param os
+	 *          is the current pipeline specification for this connection.
+	 * @param connectionSequenceNumber
+	 *          is the unique number of this connection within the job.
+	 * @param tabsArray
+	 *          is the list of tab names, to which this connector's tab is added.
+	 */
+	@Override
+	public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
+			final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
+		final Map<String, Object> templateParams = new HashMap<>();
+		final String sequenceNumber = Integer.toString(connectionSequenceNumber);
+		templateParams.put("SEQNUM", sequenceNumber);
+
+		final String tabTitle = Messages.getString(locale, "DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName");
+		tabsArray.add(tabTitle);
+
+		// The header script needs the filter lists as well.
+		fillInHtmlExtractorSpecification(templateParams, os);
+
+		Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, templateParams);
+	}
+
+	/**
+	 * Emit the body section of a job page for this pipeline connection by
+	 * rendering the editSpecification_HTML_Extractor.html resource. The template
+	 * receives the tab name, both sequence numbers and the current filter lists.
+	 * The output is placed inside the "editjob" form.
+	 *
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the preferred locale of the output.
+	 * @param os
+	 *          is the current pipeline specification for this job.
+	 * @param connectionSequenceNumber
+	 *          is the unique number of this connection within the job.
+	 * @param actualSequenceNumber
+	 *          is the connection within the job that has currently been selected.
+	 * @param tabName
+	 *          is the current tab name.
+	 */
+	@Override
+	public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
+			final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
+					throws ManifoldCFException, IOException {
+		final String ownSequence = Integer.toString(connectionSequenceNumber);
+		final String selectedSequence = Integer.toString(actualSequenceNumber);
+
+		final Map<String, Object> templateParams = new HashMap<>();
+		templateParams.put("TABNAME", tabName);
+		templateParams.put("SEQNUM", ownSequence);
+		templateParams.put("SELECTEDNUM", selectedSequence);
+
+		// Add the include/exclude filter lists shown on the tab.
+		fillInHtmlExtractorSpecification(templateParams, os);
+
+		Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, templateParams);
+	}
+
+	/**
+	 * Process a specification post. This method is called at the start of a job's
+	 * edit or view page, whenever there is a possibility that form data for a
+	 * connection has been posted. Its purpose is to gather form information and
+	 * modify the transformation specification accordingly. The name of the posted
+	 * form is "editjob".
+	 *
+	 * The include filter list is processed first, then the exclude filter list;
+	 * processing stops at the first list that reports an error.
+	 *
+	 * @param variableContext
+	 *          contains the post data, including binary file-upload information.
+	 * @param locale
+	 *          is the preferred locale of the output.
+	 * @param os
+	 *          is the current pipeline specification for this job.
+	 * @param connectionSequenceNumber
+	 *          is the unique number of this connection within the job.
+	 * @return null if all is well, or a string error message if there is an error
+	 *         that should prevent saving of the job (and cause a redirection to
+	 *         an error page).
+	 */
+	@Override
+	public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
+			final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
+
+		final String seqPrefix = "s" + connectionSequenceNumber + "_";
+
+		// Include filters
+		final String includeError = processFilterPost(variableContext, os, seqPrefix, "includefilter",
+				HtmlExtractorConfig.NODE_INCLUDEFILTER);
+		if (includeError != null) {
+			return includeError;
+		}
+
+		// Exclude filters
+		return processFilterPost(variableContext, os, seqPrefix, "excludefilter",
+				HtmlExtractorConfig.NODE_EXCLUDEFILTER);
+	}
+
+	/**
+	 * Rebuilds the specification nodes of one filter list from the posted form.
+	 * Nothing is changed when the "<fieldName>_count" parameter is absent or
+	 * empty, or when it is not a valid integer.
+	 *
+	 * @param variableContext
+	 *          contains the post data.
+	 * @param os
+	 *          is the specification to update in place.
+	 * @param seqPrefix
+	 *          is the per-connection prefix of every form field name.
+	 * @param fieldName
+	 *          is the form field stem ("includefilter" or "excludefilter").
+	 * @param nodeType
+	 *          is the specification node type holding one filter of this list.
+	 * @return null on success, or an error message when the posted count is not
+	 *         a number.
+	 */
+	private String processFilterPost(final IPostParameters variableContext, final Specification os,
+			final String seqPrefix, final String fieldName, final String nodeType) throws ManifoldCFException {
+		final String fieldPrefix = seqPrefix + fieldName + "_";
+
+		final String postedCount = variableContext.getParameter(fieldPrefix + "count");
+		if (postedCount == null || postedCount.length() == 0) {
+			// This list was not part of the post: keep the existing nodes.
+			return null;
+		}
+
+		// Validate the count BEFORE touching the specification, so that a malformed
+		// post cannot wipe the existing filters or escape as an unchecked exception.
+		final int count;
+		try {
+			count = Integer.parseInt(postedCount);
+		} catch (final NumberFormatException e) {
+			return "Invalid value for " + fieldName + "_count: '" + postedCount + "'";
+		}
+
+		// About to gather the filter nodes, so get rid of the old ones.
+		int i = 0;
+		while (i < os.getChildCount()) {
+			if (os.getChild(i).getType().equals(nodeType)) {
+				os.removeChild(i);
+			} else {
+				i++;
+			}
+		}
+
+		// Re-add every posted row that was not flagged for deletion.
+		for (int row = 0; row < count; row++) {
+			final String suffix = "_" + Integer.toString(row);
+			final String op = variableContext.getParameter(fieldPrefix + "op" + suffix);
+			if (op == null || !op.equals("Delete")) {
+				addFilterNode(os, nodeType,
+						variableContext.getParameter(fieldPrefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix));
+			}
+		}
+
+		// Finally append the row typed into the "add" line, if the Add button was used.
+		final String addop = variableContext.getParameter(fieldPrefix + "op");
+		if (addop != null && addop.equals("Add")) {
+			addFilterNode(os, nodeType, variableContext.getParameter(fieldPrefix + "regex"));
+		}
+
+		return null;
+	}
+
+	/**
+	 * Appends one filter node to the specification. A missing or blank selector
+	 * is skipped instead of being stored as an unusable node.
+	 *
+	 * @param os
+	 *          is the specification to append to.
+	 * @param nodeType
+	 *          is the type of the node to create.
+	 * @param regex
+	 *          is the posted selector value, possibly null.
+	 */
+	private void addFilterNode(final Specification os, final String nodeType, final String regex) {
+		if (regex == null || regex.trim().length() == 0) {
+			return;
+		}
+		final SpecificationNode node = new SpecificationNode(nodeType);
+		node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+		os.addChild(os.getChildCount(), node);
+	}
+
+	/**
+	 * View specification. Called in the body section of a job's view page to
+	 * present the pipeline specification information to the user. The generated
+	 * HTML can be assumed to sit inside appropriate {@code <html>} and
+	 * {@code <body>} tags.
+	 *
+	 * @param out
+	 *          is the output to which any HTML should be sent.
+	 * @param locale
+	 *          is the preferred locale of the output.
+	 * @param os
+	 *          is the current pipeline specification for this job.
+	 * @param connectionSequenceNumber
+	 *          is the unique number of this connection within the job.
+	 */
+	@Override
+	public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
+			final int connectionSequenceNumber) throws ManifoldCFException, IOException {
+		final String sequenceNumber = String.valueOf(connectionSequenceNumber);
+
+		// Template context: sequence number, then the data from all tabs.
+		final Map<String, Object> velocityContext = new HashMap<>();
+		velocityContext.put("SEQNUM", sequenceNumber);
+		fillInHtmlExtractorSpecification(velocityContext, os);
+
+		Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, velocityContext);
+	}
+	/**
+	 * Reads the include and exclude filter selectors out of a specification and
+	 * can serialize them into a single packed string.
+	 */
+	protected static class SpecPacker {
+
+		private final List<String> includeFilters = new ArrayList<>();
+		private final List<String> excludeFilters = new ArrayList<>();
+
+		/**
+		 * Collects the selector attribute of every include/exclude filter node.
+		 * When the specification holds no include filter, the default whitelist
+		 * selector is used instead.
+		 *
+		 * @param os
+		 *          is the specification to read.
+		 */
+		public SpecPacker(final Specification os) {
+			final int childCount = os.getChildCount();
+			for (int idx = 0; idx < childCount; idx++) {
+				final SpecificationNode child = os.getChild(idx);
+				final String nodeType = child.getType();
+
+				// The two node types are distinct, so at most one branch applies.
+				if (nodeType.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+					includeFilters.add(child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
+				} else if (nodeType.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+					excludeFilters.add(child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
+				}
+			}
+
+			if (includeFilters.isEmpty()) {
+				includeFilters.add(HtmlExtractorConfig.WHITELIST_DEFAULT);
+			}
+		}
+
+		/**
+		 * Packs the include list followed by the exclude list, both with the
+		 * '+' delimiter.
+		 *
+		 * @return the packed representation of both lists.
+		 */
+		public String toPackedString() {
+			final StringBuilder packed = new StringBuilder();
+			packList(packed, includeFilters, '+');
+			packList(packed, excludeFilters, '+');
+			return packed.toString();
+		}
+
+	}
+}
+
+

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,41 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+/**
+ * Constants for the HTML extractor transformation connector: default values
+ * and the names of the specification nodes and attributes.
+ */
+public class HtmlExtractorConfig {
+
+  // Configuration parameters
+	// TODO: remove the Solr parameters
+  public static final String PARAM_SOLRUPDATEHANDLER = "solrUpdateHandler";
+  public static final String SOLRUPDATEHANDLER_DEFAULT = "/update/no-tika";
+  // Selector used as the only include filter when a specification defines none.
+  public static final String WHITELIST_DEFAULT = "body";
+  // NOTE(review): presumably the default exclude selector (none) - confirm against callers.
+  public static final String BLACKLIST_DEFAULT = "";
+  
+  // Specification nodes and values
+  // Node types holding one include / exclude filter each.
+  public static final String NODE_INCLUDEFILTER = "includefilter";
+  public static final String NODE_EXCLUDEFILTER = "excludefilter";
+  // NOTE(review): same value as WHITELIST_DEFAULT; looks redundant - confirm usage.
+  public static final String INCLUDEFILTER_DEFAULT = "body";
+  // Attribute of a filter node that stores the selector text; also part of the form field names.
+  public static final String ATTRIBUTE_REGEX = "regex";
+  public static final String ATTRIBUTE_VALUE = "value";
+
+}

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,176 @@
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+public class JsoupProcessing {
+
+
+
+
+	public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist) throws IOException{
+		Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+		Hashtable<String,String> metadata = new Hashtable<String,String>();
+		for(Element meta : doc.select("meta")) {
+			metadata.put(meta.attr("name"), meta.attr("content"));
+		}
+
+
+		if (doc.select("title") != null){
+			String title = doc.select("title").text();
+			metadata.put("title", title);
+		}
+
+		Element element_keywords = doc.select("meta[name='keywords']").first();
+		if (element_keywords != null) {
+			String keywords = (element_keywords.attr("content"));
+			metadata.put("keywords",keywords);
+		}
+
+		Element element_description = doc.select("meta[name=\"description\"]").first();
+		if (element_description != null) {
+			String description = (element_description.attr("content"));
+			metadata.put("description",description);
+		}
+
+		Element element_author = doc.select("meta[name=\"author\"]").first();
+		if (element_author != null) {
+			String author = (element_author.attr("content"));
+			metadata.put("author",author);
+		}
+
+
+		Element element_dcterms_subject = doc.select("meta[name=\"dcterms.subject\"]").first();
+		if (element_dcterms_subject != null) {
+			String dc_terms_subject = (element_dcterms_subject.attr("content"));
+			metadata.put("dc_terms_subject",dc_terms_subject);
+		}
+
+
+		Element element_dcterms_title = doc.select("meta[name=\"dcterms.title\"]").first();
+		if (element_dcterms_title != null) {
+			String dc_terms_title = (element_dcterms_title.attr("content"));
+			metadata.put("dc_terms_title",dc_terms_title);
+
+		}
+
+		Element element_dcterms_creator = doc.select("meta[name=\"dcterms.creator\"]").first();
+		if (element_dcterms_creator != null) {
+			String dc_terms_creator = (element_dcterms_creator.attr("content"));
+			metadata.put("dc_terms_creator",dc_terms_creator);
+
+		}
+
+		Element element_dcterms_description = doc.select("meta[name=\"dcterms.description\"]").first();
+		if (element_dcterms_description != null) {
+			String dc_terms_description = (element_dcterms_description.attr("content"));
+			metadata.put("dc_terms_description",dc_terms_description);
+
+		}
+
+		Element element_dcterms_publisher = doc.select("meta[name=\"dcterms.publisher\"]").first();
+		if (element_dcterms_publisher != null) {
+			String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
+			metadata.put("dc_terms_publisher",dc_terms_publisher);
+
+		}
+
+		Element element_dcterms_contributor = doc.select("meta[name=\"dcterms.contributor\"]").first();
+		if (element_dcterms_contributor != null) {
+			String dc_terms_contributor = (element_dcterms_contributor.attr("content"));
+			metadata.put("dc_terms_contributor",dc_terms_contributor);
+
+		}
+
+		Element element_dcterms_date = doc.select("meta[name=\"dcterms.date\"]").first();
+		if (element_dcterms_date != null) {
+			String dc_terms_date = (element_dcterms_date.attr("content"));
+			metadata.put("dc_terms_date",dc_terms_date);
+
+		}
+
+		Element element_dcterms_type = doc.select("meta[name=\"dcterms.type\"]").first();
+		if (element_dcterms_type != null) {
+			String dc_terms_type = (element_dcterms_type.attr("content"));
+			metadata.put("dc_terms_type",dc_terms_type);
+
+		}
+
+		Element element_dcterms_format = doc.select("meta[name=\"dcterms.format\"]").first();
+		if (element_dcterms_format != null) {
+			String dc_terms_format = (element_dcterms_format.attr("content"));
+			metadata.put("dc_terms_format",dc_terms_format);
+
+		}
+
+		Element element_dcterms_language = doc.select("meta[name=\"dcterms.language\"]").first();
+		if (element_dcterms_language != null) {
+			String dc_terms_language = (element_dcterms_language.attr("content"));
+			metadata.put("dc_terms_language",dc_terms_language);
+
+		}
+
+		Element element_dcterms_identifier = doc.select("meta[name=\"dcterms.identifier\"]").first();
+		if (element_dcterms_identifier != null) {
+			String dc_terms_identifier = (element_dcterms_identifier.attr("content"));
+			metadata.put("dc_terms_identifier",dc_terms_identifier);
+		}
+
+
+		Element docToKeep = doc.body();
+		String finalDoc ;
+
+		// Englobing Tag
+		if (whitelist!="body"){
+			docToKeep = doc.select(whitelist).first();
+		}
+
+
+
+		// Blacklist
+		if (blacklist != null){
+			for (int i=0; i< blacklist.size();i++){
+				docToKeep.select(blacklist.get(i)).remove();
+			}
+		}
+
+		//finalDoc = docToKeep.text();
+		finalDoc = docToKeep.html();
+		metadata.put("extractedDoc",finalDoc);
+
+		return metadata;
+	}
+
+}
+
+

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,125 @@
+/* $Id: Messages.java 1596720 2014-05-22 00:57:29Z kwright $ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package com.francelabs.datafari.htmlextractor;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+
+/**
+ * Localized message and resource access for the HTML extractor connector.
+ * Every method is a static convenience overload that fills in this
+ * connector's bundle name, resource path and/or this class before delegating
+ * to the method of the same name with the longer argument list.
+ */
+public class Messages extends org.apache.manifoldcf.ui.i18n.Messages {
+  // Bundle name and resource path passed to the delegated calls below.
+  public static final String DEFAULT_BUNDLE_NAME = "com.francelabs.datafari.htmlextractor.common";
+  public static final String DEFAULT_PATH_NAME = "com.francelabs.datafari.htmlextractor";
+
+  /**
+   * Constructor - do not instantiate
+   */
+  protected Messages() {
+  }
+
+  // Overloads without arguments: default bundle, null argument array.
+
+  public static String getString(final Locale locale, final String messageKey) {
+    return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getAttributeString(final Locale locale, final String messageKey) {
+    return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getBodyString(final Locale locale, final String messageKey) {
+    return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getAttributeJavascriptString(final Locale locale, final String messageKey) {
+    return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getBodyJavascriptString(final Locale locale, final String messageKey) {
+    return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  // Overloads with message arguments: default bundle.
+
+  public static String getString(final Locale locale, final String messageKey, final Object[] args) {
+    return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getAttributeString(final Locale locale, final String messageKey, final Object[] args) {
+    return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getBodyString(final Locale locale, final String messageKey, final Object[] args) {
+    return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getAttributeJavascriptString(final Locale locale, final String messageKey, final Object[] args) {
+    return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getBodyJavascriptString(final Locale locale, final String messageKey, final Object[] args) {
+    return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  // More general methods which allow bundlenames and class loaders to be
+  // specified.
+  // Each one passes Messages.class as the first argument of the delegated call.
+
+  public static String getString(final String bundleName, final Locale locale, final String messageKey,
+      final Object[] args) {
+    return getString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getAttributeString(final String bundleName, final Locale locale, final String messageKey,
+      final Object[] args) {
+    return getAttributeString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getBodyString(final String bundleName, final Locale locale, final String messageKey,
+      final Object[] args) {
+    return getBodyString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getAttributeJavascriptString(final String bundleName, final Locale locale,
+      final String messageKey, final Object[] args) {
+    return getAttributeJavascriptString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getBodyJavascriptString(final String bundleName, final Locale locale, final String messageKey,
+      final Object[] args) {
+    return getBodyJavascriptString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  // Resource output
+  // These add Messages.class and the default path (and, for the Velocity
+  // variants, the default bundle) to the delegated call.
+
+  public static void outputResource(final IHTTPOutput output, final Locale locale, final String resourceKey,
+      final Map<String, String> substitutionParameters, final boolean mapToUpperCase) throws ManifoldCFException {
+    outputResource(output, Messages.class, DEFAULT_PATH_NAME, locale, resourceKey, substitutionParameters,
+        mapToUpperCase);
+  }
+
+  public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey,
+      final Map<String, String> substitutionParameters, final boolean mapToUpperCase) throws ManifoldCFException {
+    outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey,
+        substitutionParameters, mapToUpperCase);
+  }
+
+  // Variant taking arbitrary context objects; this is the one the connector's
+  // specification methods call with their parameter map.
+  public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey,
+      final Map<String, Object> contextObjects) throws ManifoldCFException {
+    outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey,
+        contextObjects);
+  }
+
+}

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,19 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+/**
+ * Checked exception type of the HTML extractor package.
+ */
+public class HtmlExtractorException extends Exception {
+
+  /** Serialization version identifier. */
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Creates an exception with a message and the exception that caused it.
+   *
+   * @param message
+   *          is the description of the failure.
+   * @param cause
+   *          is the underlying exception.
+   */
+  public HtmlExtractorException(final String message, final Exception cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates an exception with a message only.
+   *
+   * @param message
+   *          is the description of the failure.
+   */
+  public HtmlExtractorException(final String message) {
+    super(message);
+  }
+
+}

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+/**
+ * Checked exception that also carries the expression it relates to.
+ */
+public class RegexException extends Exception {
+
+  /** Serialization version identifier. */
+  private static final long serialVersionUID = 1L;
+
+  /** The expression given at construction time. */
+  private final String regex;
+
+  /**
+   * Creates an exception for an expression, with a message and a cause.
+   *
+   * @param regex
+   *          is the offending expression.
+   * @param message
+   *          is the description of the failure.
+   * @param cause
+   *          is the underlying exception.
+   */
+  public RegexException(final String regex, final String message, final Exception cause) {
+    super(message, cause);
+    this.regex = regex;
+  }
+
+  /**
+   * Creates an exception for an expression, with a message only.
+   *
+   * @param regex
+   *          is the offending expression.
+   * @param message
+   *          is the description of the failure.
+   */
+  public RegexException(final String regex, final String message) {
+    super(message);
+    this.regex = regex;
+  }
+
+  /**
+   * @return the expression given at construction time.
+   */
+  public String getRegex() {
+    return this.regex;
+  }
+
+}

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+
+function checkConfig()
+{
+  return true; // always reports success; no configuration fields are checked here
+}
+
+//-->
+</script>

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js Fri Mar 16 16:31:26 2018
@@ -0,0 +1,76 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+function s${SEQNUM}_checkSpecification()
+{
+  return true; // always reports success; no specification fields are checked here
+}
+
+function s${SEQNUM}_addIncludeFilter()
+{
+  if (editjob.s${SEQNUM}_includefilter_regex.value == "")
+  {
+    alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+    editjob.s${SEQNUM}_includefilter_regex.focus();
+    return;
+  }
+  editjob.s${SEQNUM}_includefilter_op.value="Add";
+  postFormSetAnchor("s${SEQNUM}_includefilter");
+}
+
+function s${SEQNUM}_addExcludeFilter()
+{
+  if (editjob.s${SEQNUM}_excludefilter_regex.value == "")
+  {
+    alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+    editjob.s${SEQNUM}_excludefilter_regex.focus();
+    return;
+  }
+  editjob.s${SEQNUM}_excludefilter_op.value="Add";
+  postFormSetAnchor("s${SEQNUM}_excludefilter");
+}
+
+function s${SEQNUM}_deleteIncludeFilter(i)
+{
+  // Set the operation
+  eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Delete\"");
+  // Submit
+  if (editjob.s${SEQNUM}_includefilter_count.value==i)
+    postFormSetAnchor("s${SEQNUM}_includefilter");
+  else
+    postFormSetAnchor("s${SEQNUM}_includefilter_"+i)
+  // Undo, so we won't get two deletes next time
+  eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Continue\"");
+}
+
+function s${SEQNUM}_deleteExcludeFilter(i)
+{
+  // Set the operation
+  eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Delete\"");
+  // Submit
+  if (editjob.s${SEQNUM}_excludefilter_count.value==i)
+    postFormSetAnchor("s${SEQNUM}_excludefilter");
+  else
+    postFormSetAnchor("s${SEQNUM}_excludefilter_"+i)
+  // Undo, so we won't get two deletes next time
+  eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Continue\"");
+}
+
+//-->
+</script>

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,148 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == $ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName') && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+<tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+  #if(($includecounter) < 2)
+    #set($includecounterdisplay = $includecounter + 1)
+    #if(($includecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_includefilter_$includecounter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter'))$includecounterdisplay" onclick='javascript:s${SEQNUM}_deleteIncludeFilter("$includecounter");'/>
+              <input type="hidden" name="s${SEQNUM}_includefilter_op_$includecounter" value="Continue"/><input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+            </a>
+          </td>
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+          </td>
+        </tr>
+    #set($includecounter = $includecounter + 1)
+  #end
+  #end
+  
+  
+  #if($includecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+  #end
+      
+        <tr class="formrow"><td class="formseparator" colspan="3"><hr/></td></tr>
+        <tr class="formrow">
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_includefilter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddIncludeFilter'))" onclick="javascript:s${SEQNUM}_addIncludeFilter();"/>
+            </a>
+            <input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+            <input type="hidden" name="s${SEQNUM}_includefilter_op" value="Continue"/>
+          </td>
+          <td class="formcolumncell">
+            <nobr><input type="text" size="15" name="s${SEQNUM}_includefilter_regex" value=""/></nobr>
+          </td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+    #set($excludecounterdisplay = $excludecounter + 1)
+    #if(($excludecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_excludefilter_$excludecounter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter'))$excludecounterdisplay" onclick='javascript:s${SEQNUM}_deleteExcludeFilter("$excludecounter");'/>
+              <input type="hidden" name="s${SEQNUM}_excludefilter_op_$excludecounter" value="Continue"/>
+              <input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+            </a>
+          </td>
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+          </td>
+        </tr>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+  
+  #if($excludecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+  #end
+      
+        <tr class="formrow"><td class="formseparator" colspan="3"><hr/></td></tr>
+        <tr class="formrow">
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_excludefilter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddExcludeFilter'))" onclick="javascript:s${SEQNUM}_addExcludeFilter();"/>
+            </a>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_op" value="Continue"/>
+          </td>
+          <td class="formcolumncell">
+            <nobr><input type="text" size="15" name="s${SEQNUM}_excludefilter_regex" value=""/></nobr>
+          </td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+</table>
+
+#else
+
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+    #set($includecounter = $includecounter + 1)
+  #end
+<input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+<input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+
+
+#end

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<table class="displaytable">
+  
+</table>

Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,82 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<table class="displaytable">
+
+
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+    #if(($includecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+          </td>
+        </tr>
+    #set($includecounter = $includecounter + 1)
+  #end
+  
+  #if($includecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+  #end
+      </table>
+    </td>
+  </tr>
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+    #if(($excludecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+          </td>
+        </tr>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+  
+  #if($excludecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+  #end
+      </table>
+    </td>
+  </tr>
+</table>