You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2018/03/16 16:31:27 UTC
svn commit: r1827009 - in
/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector:
./ connector/ connector/src/ connector/src/main/ connector/src/main/java/
connector/src/main/java/com/ connector/src/main/java/com/francelabs/ ...
Author: kwright
Date: Fri Mar 16 16:31:26 2018
New Revision: 1827009
URL: http://svn.apache.org/viewvc?rev=1827009&view=rev
Log:
Commit initial contribution (with path changes)
Added:
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/datafari/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/datafari/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/datafari/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml Fri Mar 16 16:31:26 2018
@@ -0,0 +1,59 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Ant build for the HTML extractor transformation connector. It locates the ManifoldCF
+ distribution, imports the shared connector build, and adds the jsoup jar(s) to the
+ compile classpath and to dist/lib. -->
+<project name="html" default="all">
+
+ <!-- Resolve mcf-dist from the MCFDISTPATH environment variable when set, otherwise ../../dist. -->
+ <property environment="env"/>
+ <condition property="mcf-dist" value="${env.MCFDISTPATH}">
+ <isset property="env.MCFDISTPATH"/>
+ </condition>
+ <property name="abs-dist" location="../../dist"/>
+ <condition property="mcf-dist" value="${abs-dist}">
+ <not>
+ <isset property="env.MCFDISTPATH"/>
+ </not>
+ </condition>
+
+ <import file="${mcf-dist}/connector-build.xml"/>
+
+ <!-- Compile classpath: the imported connector classpath plus any jsoup jar found in ../../lib. -->
+ <path id="connector-classpath">
+ <path refid="mcf-connector-build.connector-classpath"/>
+ <fileset dir="../../lib">
+ <include name="jsoup*.jar"/>
+
+ </fileset>
+ </path>
+
+ <!-- Copy the jsoup jar(s) into dist/lib; runs only when the canBuild property is set. -->
+ <target name="lib" depends="mcf-connector-build.lib,precompile-check" if="canBuild">
+ <mkdir dir="dist/lib"/>
+ <copy todir="dist/lib">
+ <fileset dir="../../lib">
+ <include name="jsoup*.jar"/>
+ </fileset>
+ </copy>
+ </target>
+
+ <!-- Register the connector as a transformation connector after the standard delivery step. -->
+ <target name="deliver-connector" depends="mcf-connector-build.deliver-connector">
+ <!-- NOTE(review): the connector-class below uses the package segment "agents.transformers",
+ while HtmlExtractor.java in this commit is stored under
+ org/apache/manifoldcf/agents/transformation/htmlextractor and declares package
+ com.francelabs.datafari.htmlextractor. Confirm the intended fully-qualified class name. -->
+ <antcall target="general-add-transformation-connector">
+ <param name="connector-label" value="Html extractor"/>
+ <param name="connector-class" value="org.apache.manifoldcf.agents.transformers.htmlextractor.HtmlExtractor"/>
+ </antcall>
+ </target>
+
+</project>
+
+
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,753 @@
+package com.francelabs.datafari.htmlextractor;
+
+/* $Id$ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+
+import com.francelabs.datafari.htmlextractor.exception.RegexException;
+
+import org.apache.manifoldcf.agents.interfaces.*;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
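+/** Transformation connector that uses Jsoup to extract text and metadata from HTML documents,
+ * replaces the document content with the extracted text, and passes the document down the pipeline.
+ */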
+public class HtmlExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
+{
+
+ /** Revision-control identification string ($Id$ keyword). */
+ public static final String _rcsid = "@(#)$Id$";
+
+ /** Name of the activity recorded for each processed document. */
+ protected static final String ACTIVITY_PROCESS = "process";
+
+ /** The activity names returned by getActivitiesList(). */
+ protected static final String[] activitiesList = new String[]{ACTIVITY_PROCESS};
+
+ /**
+ * Resource name of the javascript emitted by outputConfigurationHeader().
+ */
+ private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
+
+ /** Resource name of the template rendered by viewConfiguration(). */
+ private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
+ /** Resource name of the javascript emitted by outputSpecificationHeader(). */
+ private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
+ /** Resource name of the specification view template; presumably used by the view-specification method, which is not visible in this part of the file. */
+ private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
+ /** Resource name of the template rendered by outputSpecificationBody(). */
+ private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";
+
+
+ /** We handle up to 64K in memory; after that we go to disk. Not referenced by any active code visible here. */
+ protected static final long inMemoryMaximumFile = 65536;
+
+  /** Report the activity names this connector may record.
+   * No connection is needed to call this method.
+   *@return the shared array of activity names (currently only the "process" activity).
+   */
+  @Override
+  public String[] getActivitiesList()
+  {
+    return activitiesList;
+  }
+
+  /** Extract text and metadata from an HTML document and send the result to the next pipeline stage.
+   * Documents whose mime type starts with neither "text/html" nor "application/xhtml+xml" (or that
+   * have no mime type) are forwarded unchanged. For HTML documents the binary content is replaced by
+   * the text returned under the "extractedDoc" key, and every other returned entry is added to the
+   * document as a field named "jsoup_" + key.
+   *@param documentURI is the URI of the document; it is passed through to the next stage and to the activity record.
+   *@param pipelineDescription is the versioned pipeline specification, unpacked here into include/exclude filters.
+   *@param document is the document to transform; its binary stream and fields are modified in place.
+   *@param authorityNameString is the name of the authority for the document's access tokens. May be null. Not used here.
+   *@param activities is the handle used to record the "process" activity and to send the document onward.
+   *@return the document status returned by the next pipeline stage.
+   *@throws ManifoldCFException if extraction or forwarding reports a ManifoldCF error (rethrown after being recorded).
+   *@throws ServiceInterruption if forwarding is interrupted (rethrown after being recorded).
+   *@throws IOException if there is a stream error (rethrown after being recorded).
+   */
+  @Override
+  public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+    throws ManifoldCFException, ServiceInterruption, IOException
+  {
+    final long startTime = System.currentTimeMillis();
+    final String extractedDocKey = "extractedDoc";
+    String resultCode = "OK";
+    String description = null;
+    Long length = null;
+
+    final SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+
+    Logging.root.info("Processing by HTML Extractor");
+
+    // Accept both HTML and XHTML. The null check avoids a NullPointerException when no mime type is set.
+    final String mimeType = document.getMimeType();
+    final boolean isHtml = mimeType != null
+      && (mimeType.startsWith("text/html") || mimeType.startsWith("application/xhtml+xml"));
+    if (!isHtml)
+    {
+      Logging.root.warn("no processing, mime type not html");
+      return activities.sendDocument(documentURI,document);
+    }
+
+    try
+    {
+      Logging.root.info("Document recognized as HTML - processing");
+      length = Long.valueOf(document.getBinaryLength());
+
+      //TODO
+      /* Add an option to keep HTML markup of the extracted text or not -
+       * in case for example of processing by Tika after this transformation connector
+       */
+      // NOTE(review): only the first include filter is used; an empty include list ends up in the
+      // generic catch below (IndexOutOfBoundsException) - confirm this is intended.
+      final Hashtable<String,String> metadataExtracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters);
+
+      // Replace the document content with the extracted text, using the exact byte count as length.
+      final byte[] extractedBytes = metadataExtracted.get(extractedDocKey).getBytes(StandardCharsets.UTF_8);
+      document.setBinary(new ByteArrayInputStream(extractedBytes), extractedBytes.length);
+
+      // Everything except the extracted text itself becomes a "jsoup_"-prefixed field.
+      for (final Entry<String,String> entry : metadataExtracted.entrySet())
+      {
+        if (!extractedDocKey.equals(entry.getKey()))
+          document.addField("jsoup_"+entry.getKey(), entry.getValue());
+      }
+
+      return activities.sendDocument(documentURI,document);
+    }
+    catch (ServiceInterruption e)
+    {
+      resultCode = "SERVICEINTERRUPTION";
+      description = e.getMessage();
+      throw e;
+    }
+    catch (ManifoldCFException e)
+    {
+      resultCode = "EXCEPTION";
+      description = e.getMessage();
+      throw e;
+    }
+    catch (IOException e)
+    {
+      resultCode = "IOEXCEPTION";
+      description = e.getMessage();
+      throw e;
+    }
+    catch (Exception e)
+    {
+      // Any other failure is recorded in the activity and the document is still forwarded below.
+      resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+      description = e.getMessage();
+    }
+    finally
+    {
+      activities.recordActivity(Long.valueOf(startTime), ACTIVITY_PROCESS, length, documentURI,
+        resultCode, description);
+    }
+
+    // Reached only after a non-fatal extraction failure.
+    // NOTE(review): the original binary stream may already have been read by the extractor - confirm.
+    return activities.sendDocument(documentURI,document);
+  }
+
+
+  /** Temporary holding area for transformed document content: written through an output stream,
+   * then read back through an input stream.
+   */
+  protected interface DestinationStorage
+  {
+    /** Obtain the stream to write content to. The caller must close it once writing is finished.
+     *@return the output stream.
+     */
+    OutputStream getOutputStream()
+      throws ManifoldCFException;
+
+    /** Obtain the number of bytes stored.
+     *@return the new binary length.
+     */
+    long getBinaryLength()
+      throws ManifoldCFException;
+
+    /** Obtain a stream to read the stored content from. The caller must close it once reading is finished.
+     *@return the input stream.
+     */
+    InputStream getInputStream()
+      throws ManifoldCFException;
+
+    /** Release everything held by this object.
+     * Call this when the stored data is no longer needed.
+     */
+    void close()
+      throws ManifoldCFException;
+  }
+
+  /** DestinationStorage backed by a temporary file on disk. */
+  protected static class FileDestinationStorage implements DestinationStorage
+  {
+    protected final File outputFile;
+    protected final OutputStream outputStream;
+
+    /** Create the temporary file and open an output stream on it.
+     *@throws ManifoldCFException if the file cannot be created or opened.
+     */
+    public FileDestinationStorage()
+      throws ManifoldCFException
+    {
+      File tempFile = null;
+      final OutputStream tempStream;
+      try
+      {
+        tempFile = File.createTempFile("mcftika","tmp");
+        tempStream = new FileOutputStream(tempFile);
+      }
+      catch (IOException e)
+      {
+        // Do not leave an orphaned temp file behind if the stream could not be opened.
+        if (tempFile != null)
+          tempFile.delete();
+        throw wrapIOException(e);
+      }
+      this.outputFile = tempFile;
+      this.outputStream = tempStream;
+    }
+
+    /** Get the output stream to write to. Caller should explicitly close this stream when done writing.
+     */
+    @Override
+    public OutputStream getOutputStream()
+      throws ManifoldCFException
+    {
+      return outputStream;
+    }
+
+    /** Get new binary length, i.e. the current length of the temporary file.
+     */
+    @Override
+    public long getBinaryLength()
+      throws ManifoldCFException
+    {
+      return outputFile.length();
+    }
+
+    /** Get the input stream to read from. Caller should explicitly close this stream when done reading.
+     *@throws ManifoldCFException if the temporary file cannot be opened for reading.
+     */
+    @Override
+    public InputStream getInputStream()
+      throws ManifoldCFException
+    {
+      try
+      {
+        return new FileInputStream(outputFile);
+      }
+      catch (IOException e)
+      {
+        throw wrapIOException(e);
+      }
+    }
+
+    /** Convert an IOException on local storage into a ManifoldCFException, keeping the cause.
+     * Interrupted I/O is reported with the INTERRUPTED error code.
+     */
+    private static ManifoldCFException wrapIOException(IOException e)
+    {
+      if (e instanceof InterruptedIOException)
+        return new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+      return new ManifoldCFException(e.getMessage(),e);
+    }
+
+    /** Close the object and clean up everything (deletes the temporary file).
+     * This should be called when the data is no longer needed.
+     */
+    @Override
+    public void close()
+      throws ManifoldCFException
+    {
+      outputFile.delete();
+    }
+
+  }
+
+  /** DestinationStorage that keeps the content in an in-memory byte buffer. */
+  protected static class MemoryDestinationStorage implements DestinationStorage
+  {
+    protected final ByteArrayOutputStream outputStream;
+
+    /** Create the buffer.
+     *@param sizeHint is the initial capacity of the buffer, in bytes.
+     */
+    public MemoryDestinationStorage(int sizeHint)
+    {
+      outputStream = new ByteArrayOutputStream(sizeHint);
+    }
+
+    /** Get the output stream to write to.
+     */
+    @Override
+    public OutputStream getOutputStream()
+      throws ManifoldCFException
+    {
+      return outputStream;
+    }
+
+    /** Get new binary length, i.e. the number of bytes written so far.
+     */
+    @Override
+    public long getBinaryLength()
+      throws ManifoldCFException
+    {
+      return outputStream.size();
+    }
+
+    /** Get the input stream to read from. Each call returns a stream over a fresh copy of the buffer.
+     */
+    @Override
+    public InputStream getInputStream()
+      throws ManifoldCFException
+    {
+      return new ByteArrayInputStream(outputStream.toByteArray());
+    }
+
+    /** Close the object and clean up everything. Nothing needs releasing for the in-memory buffer.
+     */
+    @Override
+    public void close()
+      throws ManifoldCFException
+    {
+    }
+
+    /** Convert an IOException on local storage into a ManifoldCFException, keeping the cause.
+     * This method always throws; the int return type exists only so callers can write
+     * "return handleIOException(e)".
+     *@throws ManifoldCFException always; with the INTERRUPTED code for interrupted I/O.
+     */
+    protected static int handleIOException(IOException e)
+      throws ManifoldCFException
+    {
+      // IOException reading from our local storage...
+      if (e instanceof InterruptedIOException)
+        throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+      throw new ManifoldCFException(e.getMessage(),e);
+    }
+
+  }
+  /**
+   * Find the first regular expression in the list that matches somewhere in the
+   * given string.
+   *
+   * @param regexList
+   *          the regular expressions to try, in order
+   * @param str
+   *          the string to search
+   * @return the first expression found in the string, or null when none is found
+   * @throws RegexException
+   *           if one of the expressions tried is not a valid pattern
+   */
+  private String matchingRegex(final List<String> regexList, final String str) throws RegexException {
+    final Iterator<String> candidates = regexList.iterator();
+    while (candidates.hasNext()) {
+      final String candidate = candidates.next();
+      final boolean found;
+      try {
+        found = Pattern.compile(candidate).matcher(str).find();
+      } catch (final PatternSyntaxException e) {
+        throw new RegexException(candidate, "Invalid regular expression");
+      }
+      if (found) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+
+
+
+
+
+
+  /**
+   * Emit the head section of the connection configuration page. This
+   * implementation writes the editConfiguration.js resource and adds no tabs.
+   *
+   * @param threadContext
+   *          is the local thread context (not used here).
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the locale used to select the resource.
+   * @param parameters
+   *          are the current configuration parameters of the connection (not
+   *          used here).
+   * @param tabsArray
+   *          is the list of tab names; left untouched by this connector.
+   */
+  @Override
+  public void outputConfigurationHeader(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+      final ConfigParams parameters, final List<String> tabsArray) throws ManifoldCFException, IOException {
+    Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_JS, null);
+  }
+
+  /**
+   * Output the configuration body section. This method is called in the body
+   * section of the connector's configuration page (form name "editconnection").
+   * This connector currently writes nothing here: no edit-configuration
+   * template is referenced by this class.
+   *
+   * @param threadContext
+   *          is the local thread context.
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param parameters
+   *          are the configuration parameters, as they currently exist, for
+   *          this connection being configured.
+   * @param tabName
+   *          is the current tab name.
+   */
+  @Override
+  public void outputConfigurationBody(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+      final ConfigParams parameters, final String tabName) throws ManifoldCFException, IOException {
+    // Intentionally empty: there is no configuration form to render.
+  }
+
+  /**
+   * Handle a post of the connection configuration form ("editconnection").
+   * This implementation reads nothing from the post and leaves the parameters
+   * unchanged.
+   *
+   * @param threadContext
+   *          is the local thread context.
+   * @param variableContext
+   *          is the set of variables available from the post.
+   * @param locale
+   *          is the preferred locale.
+   * @param parameters
+   *          are the current configuration parameters of the connection.
+   * @return always null, meaning there is no error to report.
+   */
+  @Override
+  public String processConfigurationPost(final IThreadContext threadContext, final IPostParameters variableContext,
+      final Locale locale, final ConfigParams parameters) throws ManifoldCFException {
+    return null;
+  }
+
+  /**
+   * Render the read-only view of the connection configuration by writing the
+   * viewConfiguration.html resource with an empty context.
+   *
+   * @param threadContext
+   *          is the local thread context.
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the locale used to select the resource.
+   * @param parameters
+   *          are the current configuration parameters of the connection (not
+   *          used here).
+   */
+  @Override
+  public void viewConfiguration(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+      final ConfigParams parameters) throws ManifoldCFException, IOException {
+    final Map<String, Object> emptyContext = new HashMap<>();
+    Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, emptyContext);
+  }
+
+  /**
+   * Collect the include and exclude filter expressions from a specification and
+   * store them in the given map under "INCLUDEFILTERS" and "EXCLUDEFILTERS".
+   * Nodes of any other type, and filter nodes without a regex attribute, are
+   * ignored.
+   *
+   * @param paramMap
+   *          is the map that receives the two lists.
+   * @param os
+   *          is the specification whose child nodes are scanned.
+   */
+  protected static void fillInHtmlExtractorSpecification(final Map<String, Object> paramMap, final Specification os) {
+    final List<String> includes = new ArrayList<String>();
+    final List<String> excludes = new ArrayList<String>();
+
+    for (int idx = 0; idx < os.getChildCount(); idx++) {
+      final SpecificationNode child = os.getChild(idx);
+
+      // Pick the list this node contributes to; skip unrelated node types.
+      final List<String> target;
+      if (child.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+        target = includes;
+      } else if (child.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+        target = excludes;
+      } else {
+        continue;
+      }
+
+      final String regex = child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+      if (regex != null) {
+        target.add(regex);
+      }
+    }
+
+    paramMap.put("INCLUDEFILTERS", includes);
+    paramMap.put("EXCLUDEFILTERS", excludes);
+  }
+
+  /**
+   * Emit the head section of the job page for this pipeline connection: adds
+   * this connector's tab name to the tab list and writes the
+   * editSpecification.js resource.
+   *
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the locale used for the tab name and the resource.
+   * @param os
+   *          is the current pipeline specification for this connection.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @param tabsArray
+   *          is the list of tab names; this connector's tab is appended to it.
+   */
+  @Override
+  public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
+      final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
+    final Map<String, Object> context = new HashMap<>();
+    context.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+
+    final String tabTitle = Messages.getString(locale, "DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName");
+    tabsArray.add(tabTitle);
+
+    // The script receives the current include/exclude filters as well.
+    fillInHtmlExtractorSpecification(context, os);
+
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, context);
+  }
+
+  /**
+   * Emit the body section of the job page for this pipeline connection (form
+   * name "editjob") by writing the editSpecification_HTML_Extractor.html
+   * resource with the tab name, sequence numbers and current filters.
+   *
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param os
+   *          is the current pipeline specification for this job.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @param actualSequenceNumber
+   *          is the connection within the job that has currently been selected.
+   * @param tabName
+   *          is the current tab name.
+   */
+  @Override
+  public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
+      final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
+      throws ManifoldCFException, IOException {
+    final Map<String, Object> context = new HashMap<>();
+
+    // Page state handed to the template.
+    context.put("TABNAME", tabName);
+    context.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+    context.put("SELECTEDNUM", String.valueOf(actualSequenceNumber));
+
+    // Current include/exclude filters.
+    fillInHtmlExtractorSpecification(context, os);
+
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, context);
+  }
+
+ /**
+ * Process a specification post. This method is called at the start of job's
+ * edit or view page, whenever there is a possibility that form data for a
+ * connection has been posted. Its purpose is to gather form information and
+ * modify the transformation specification accordingly. The name of the posted
+ * form is "editjob".
+ *
+ * @param variableContext
+ * contains the post data, including binary file-upload information.
+ * @param locale
+ * is the preferred local of the output.
+ * @param os
+ * is the current pipeline specification for this job.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @return null if all is well, or a string error message if there is an error
+ * that should prevent saving of the job (and cause a redirection to
+ * an error page).
+ */
+ @Override
+ public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
+     final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
+
+   // Every form field of this connection is prefixed with "s<sequence number>_".
+   final String seqPrefix = "s" + connectionSequenceNumber + "_";
+
+   // Include filters are processed first, then exclude filters.
+   final String includeError = gatherFilterNodes(variableContext, os, seqPrefix + "includefilter_",
+       HtmlExtractorConfig.NODE_INCLUDEFILTER);
+   if (includeError != null) {
+     return includeError;
+   }
+
+   return gatherFilterNodes(variableContext, os, seqPrefix + "excludefilter_",
+       HtmlExtractorConfig.NODE_EXCLUDEFILTER);
+ }
+
+ /**
+  * Rebuilds all specification nodes of one filter type from the posted form
+  * data. Nothing is changed when the "count" field for that filter type was
+  * not posted.
+  *
+  * @param variableContext
+  *          contains the post data.
+  * @param os
+  *          is the specification to update in place.
+  * @param prefix
+  *          is the common form-field prefix of the filter type, e.g.
+  *          "s0_includefilter_".
+  * @param nodeType
+  *          is the specification node type to replace.
+  * @return null if all is well, or an error message when the posted count is
+  *         not a valid integer (the specification is left untouched in that
+  *         case).
+  */
+ private String gatherFilterNodes(final IPostParameters variableContext, final Specification os,
+     final String prefix, final String nodeType) throws ManifoldCFException {
+   final String postedCount = variableContext.getParameter(prefix + "count");
+   if (postedCount == null || postedCount.length() == 0) {
+     // This filter section was not part of the post; keep the existing nodes.
+     return null;
+   }
+
+   // Validate the count before touching the specification, so that a bad post
+   // cannot leave it half-updated or escape as an unchecked exception.
+   final int count;
+   try {
+     count = Integer.parseInt(postedCount);
+   } catch (final NumberFormatException e) {
+     return "Invalid value '" + postedCount + "' for form parameter '" + prefix + "count'";
+   }
+
+   // About to gather the nodes again, so get rid of the old ones. Walking
+   // backwards keeps the remaining indexes valid while removing.
+   for (int i = os.getChildCount() - 1; i >= 0; i--) {
+     if (os.getChild(i).getType().equals(nodeType)) {
+       os.removeChild(i);
+     }
+   }
+
+   // Re-create every posted row that was not flagged for deletion.
+   for (int i = 0; i < count; i++) {
+     final String suffix = "_" + Integer.toString(i);
+     final String op = variableContext.getParameter(prefix + "op" + suffix);
+     if (!"Delete".equals(op)) {
+       final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
+       final SpecificationNode node = new SpecificationNode(nodeType);
+       node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+       os.addChild(os.getChildCount(), node);
+     }
+   }
+
+   // Append the new row when the "Add" button of this section was used.
+   final String addop = variableContext.getParameter(prefix + "op");
+   if ("Add".equals(addop)) {
+     final String regex = variableContext.getParameter(prefix + "regex");
+     final SpecificationNode node = new SpecificationNode(nodeType);
+     node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+     os.addChild(os.getChildCount(), node);
+   }
+
+   return null;
+ }
+
+ /**
+ * View specification. This method is called in the body section of a job's
+ * view page. Its purpose is to present the pipeline specification information
+ * to the user. The coder can presume that the HTML that is output from this
+ * configuration will be within appropriate <html> and <body> tags.
+ *
+ * @param out
+ * is the output to which any HTML should be sent.
+ * @param locale
+ * is the preferred locale of the output.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @param os
+ * is the current pipeline specification for this job.
+ */
+ @Override
+ public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
+     final int connectionSequenceNumber) throws ManifoldCFException, IOException {
+   // Context object handed to the Velocity template that renders the view page.
+   final Map<String, Object> velocityContext = new HashMap<>();
+   velocityContext.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+
+   // Let the shared helper add the entries derived from the specification.
+   fillInHtmlExtractorSpecification(velocityContext, os);
+
+   Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, velocityContext);
+ }
+ /**
+  * Collects the include and exclude filter values of a specification and
+  * renders them as one packed string.
+  */
+ protected static class SpecPacker {
+
+   /** Values of the "regex" attribute of every include-filter node, in specification order. */
+   private final List<String> includeFilters = new ArrayList<>();
+   /** Values of the "regex" attribute of every exclude-filter node, in specification order. */
+   private final List<String> excludeFilters = new ArrayList<>();
+
+   /**
+    * Reads the filter nodes out of the given specification.
+    *
+    * @param os
+    *          is the specification to read.
+    */
+   public SpecPacker(final Specification os) {
+     final int childCount = os.getChildCount();
+     for (int index = 0; index < childCount; index++) {
+       final SpecificationNode child = os.getChild(index);
+       final String nodeType = child.getType();
+
+       if (nodeType.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+         includeFilters.add(child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
+       } else if (nodeType.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+         excludeFilters.add(child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
+       }
+     }
+
+     // Without any include filter, fall back to the default whitelist value.
+     if (includeFilters.isEmpty()) {
+       includeFilters.add(HtmlExtractorConfig.WHITELIST_DEFAULT);
+     }
+   }
+
+   /**
+    * @return the include filters followed by the exclude filters, both packed
+    *         with '+' as the delimiter.
+    */
+   public String toPackedString() {
+     final StringBuilder packed = new StringBuilder();
+     packList(packed, includeFilters, '+');
+     packList(packed, excludeFilters, '+');
+     return packed.toString();
+   }
+
+ }
+}
+
+
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,41 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+/**
+ * Parameters for the HTML extractor transformation connector.
+ */
+ public class HtmlExtractorConfig {
+
+   // ---- Configuration parameters ----
+
+   // TODO: remove the Solr parameters
+   /** Name of the Solr update handler configuration parameter. */
+   public static final String PARAM_SOLRUPDATEHANDLER = "solrUpdateHandler";
+   /** Default value of the Solr update handler parameter. */
+   public static final String SOLRUPDATEHANDLER_DEFAULT = "/update/no-tika";
+
+   // ---- Default filter values ----
+
+   /** Default whitelist value. */
+   public static final String WHITELIST_DEFAULT = "body";
+   /** Default blacklist value (empty). */
+   public static final String BLACKLIST_DEFAULT = "";
+   /** Default include filter value. */
+   public static final String INCLUDEFILTER_DEFAULT = "body";
+
+   // ---- Specification node types ----
+
+   /** Type of the specification nodes holding an include filter. */
+   public static final String NODE_INCLUDEFILTER = "includefilter";
+   /** Type of the specification nodes holding an exclude filter. */
+   public static final String NODE_EXCLUDEFILTER = "excludefilter";
+
+   // ---- Specification node attributes ----
+
+   /** Name of the "regex" attribute of a filter node. */
+   public static final String ATTRIBUTE_REGEX = "regex";
+   /** Name of the "value" attribute. */
+   public static final String ATTRIBUTE_VALUE = "value";
+
+ }
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,176 @@
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+public class JsoupProcessing {
+
+
+
+
+ public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist) throws IOException{
+ Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+ Hashtable<String,String> metadata = new Hashtable<String,String>();
+ for(Element meta : doc.select("meta")) {
+ metadata.put(meta.attr("name"), meta.attr("content"));
+ }
+
+
+ if (doc.select("title") != null){
+ String title = doc.select("title").text();
+ metadata.put("title", title);
+ }
+
+ Element element_keywords = doc.select("meta[name='keywords']").first();
+ if (element_keywords != null) {
+ String keywords = (element_keywords.attr("content"));
+ metadata.put("keywords",keywords);
+ }
+
+ Element element_description = doc.select("meta[name=\"description\"]").first();
+ if (element_description != null) {
+ String description = (element_description.attr("content"));
+ metadata.put("description",description);
+ }
+
+ Element element_author = doc.select("meta[name=\"author\"]").first();
+ if (element_author != null) {
+ String author = (element_author.attr("content"));
+ metadata.put("author",author);
+ }
+
+
+ Element element_dcterms_subject = doc.select("meta[name=\"dcterms.subject\"]").first();
+ if (element_dcterms_subject != null) {
+ String dc_terms_subject = (element_dcterms_subject.attr("content"));
+ metadata.put("dc_terms_subject",dc_terms_subject);
+ }
+
+
+ Element element_dcterms_title = doc.select("meta[name=\"dcterms.title\"]").first();
+ if (element_dcterms_title != null) {
+ String dc_terms_title = (element_dcterms_title.attr("content"));
+ metadata.put("dc_terms_title",dc_terms_title);
+
+ }
+
+ Element element_dcterms_creator = doc.select("meta[name=\"dcterms.creator\"]").first();
+ if (element_dcterms_creator != null) {
+ String dc_terms_creator = (element_dcterms_creator.attr("content"));
+ metadata.put("dc_terms_creator",dc_terms_creator);
+
+ }
+
+ Element element_dcterms_description = doc.select("meta[name=\"dcterms.description\"]").first();
+ if (element_dcterms_description != null) {
+ String dc_terms_description = (element_dcterms_description.attr("content"));
+ metadata.put("dc_terms_description",dc_terms_description);
+
+ }
+
+ Element element_dcterms_publisher = doc.select("meta[name=\"dcterms.publisher\"]").first();
+ if (element_dcterms_publisher != null) {
+ String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
+ metadata.put("dc_terms_publisher",dc_terms_publisher);
+
+ }
+
+ Element element_dcterms_contributor = doc.select("meta[name=\"dcterms.contributor\"]").first();
+ if (element_dcterms_contributor != null) {
+ String dc_terms_contributor = (element_dcterms_contributor.attr("content"));
+ metadata.put("dc_terms_contributor",dc_terms_contributor);
+
+ }
+
+ Element element_dcterms_date = doc.select("meta[name=\"dcterms.date\"]").first();
+ if (element_dcterms_date != null) {
+ String dc_terms_date = (element_dcterms_date.attr("content"));
+ metadata.put("dc_terms_date",dc_terms_date);
+
+ }
+
+ Element element_dcterms_type = doc.select("meta[name=\"dcterms.type\"]").first();
+ if (element_dcterms_type != null) {
+ String dc_terms_type = (element_dcterms_type.attr("content"));
+ metadata.put("dc_terms_type",dc_terms_type);
+
+ }
+
+ Element element_dcterms_format = doc.select("meta[name=\"dcterms.format\"]").first();
+ if (element_dcterms_format != null) {
+ String dc_terms_format = (element_dcterms_format.attr("content"));
+ metadata.put("dc_terms_format",dc_terms_format);
+
+ }
+
+ Element element_dcterms_language = doc.select("meta[name=\"dcterms.language\"]").first();
+ if (element_dcterms_language != null) {
+ String dc_terms_language = (element_dcterms_language.attr("content"));
+ metadata.put("dc_terms_language",dc_terms_language);
+
+ }
+
+ Element element_dcterms_identifier = doc.select("meta[name=\"dcterms.identifier\"]").first();
+ if (element_dcterms_identifier != null) {
+ String dc_terms_identifier = (element_dcterms_identifier.attr("content"));
+ metadata.put("dc_terms_identifier",dc_terms_identifier);
+ }
+
+
+ Element docToKeep = doc.body();
+ String finalDoc ;
+
+ // Englobing Tag
+ if (whitelist!="body"){
+ docToKeep = doc.select(whitelist).first();
+ }
+
+
+
+ // Blacklist
+ if (blacklist != null){
+ for (int i=0; i< blacklist.size();i++){
+ docToKeep.select(blacklist.get(i)).remove();
+ }
+ }
+
+ //finalDoc = docToKeep.text();
+ finalDoc = docToKeep.html();
+ metadata.put("extractedDoc",finalDoc);
+
+ return metadata;
+ }
+
+}
+
+
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,125 @@
+/* $Id: Messages.java 1596720 2014-05-22 00:57:29Z kwright $ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package com.francelabs.datafari.htmlextractor;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+
+ /**
+  * Localized message and resource lookup for the HTML extractor connector.
+  * Every method delegates to the inherited implementation, supplying this
+  * connector's bundle name, resource path and class.
+  */
+ public class Messages extends org.apache.manifoldcf.ui.i18n.Messages {
+   // The bundle and path names must match the directory the resources are
+   // shipped in: the common_*.properties files and the UI resources of this
+   // connector live under org/apache/manifoldcf/agents/transformation/htmlextractor.
+   public static final String DEFAULT_BUNDLE_NAME = "org.apache.manifoldcf.agents.transformation.htmlextractor.common";
+   public static final String DEFAULT_PATH_NAME = "org.apache.manifoldcf.agents.transformation.htmlextractor";
+
+   /**
+    * Constructor - do not instantiate
+    */
+   protected Messages() {
+   }
+
+   // Lookups in the default bundle, without message arguments.
+
+   public static String getString(final Locale locale, final String messageKey) {
+     return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+   }
+
+   public static String getAttributeString(final Locale locale, final String messageKey) {
+     return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+   }
+
+   public static String getBodyString(final Locale locale, final String messageKey) {
+     return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+   }
+
+   public static String getAttributeJavascriptString(final Locale locale, final String messageKey) {
+     return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+   }
+
+   public static String getBodyJavascriptString(final Locale locale, final String messageKey) {
+     return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+   }
+
+   // Lookups in the default bundle, with message arguments.
+
+   public static String getString(final Locale locale, final String messageKey, final Object[] args) {
+     return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+   }
+
+   public static String getAttributeString(final Locale locale, final String messageKey, final Object[] args) {
+     return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+   }
+
+   public static String getBodyString(final Locale locale, final String messageKey, final Object[] args) {
+     return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+   }
+
+   public static String getAttributeJavascriptString(final Locale locale, final String messageKey, final Object[] args) {
+     return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+   }
+
+   public static String getBodyJavascriptString(final Locale locale, final String messageKey, final Object[] args) {
+     return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+   }
+
+   // More general methods which allow bundlenames and class loaders to be
+   // specified.
+
+   public static String getString(final String bundleName, final Locale locale, final String messageKey,
+       final Object[] args) {
+     return getString(Messages.class, bundleName, locale, messageKey, args);
+   }
+
+   public static String getAttributeString(final String bundleName, final Locale locale, final String messageKey,
+       final Object[] args) {
+     return getAttributeString(Messages.class, bundleName, locale, messageKey, args);
+   }
+
+   public static String getBodyString(final String bundleName, final Locale locale, final String messageKey,
+       final Object[] args) {
+     return getBodyString(Messages.class, bundleName, locale, messageKey, args);
+   }
+
+   public static String getAttributeJavascriptString(final String bundleName, final Locale locale,
+       final String messageKey, final Object[] args) {
+     return getAttributeJavascriptString(Messages.class, bundleName, locale, messageKey, args);
+   }
+
+   public static String getBodyJavascriptString(final String bundleName, final Locale locale, final String messageKey,
+       final Object[] args) {
+     return getBodyJavascriptString(Messages.class, bundleName, locale, messageKey, args);
+   }
+
+   // Resource output
+
+   public static void outputResource(final IHTTPOutput output, final Locale locale, final String resourceKey,
+       final Map<String, String> substitutionParameters, final boolean mapToUpperCase) throws ManifoldCFException {
+     outputResource(output, Messages.class, DEFAULT_PATH_NAME, locale, resourceKey, substitutionParameters,
+         mapToUpperCase);
+   }
+
+   public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey,
+       final Map<String, String> substitutionParameters, final boolean mapToUpperCase) throws ManifoldCFException {
+     outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey,
+         substitutionParameters, mapToUpperCase);
+   }
+
+   public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey,
+       final Map<String, Object> contextObjects) throws ManifoldCFException {
+     outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey,
+         contextObjects);
+   }
+
+ }
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,19 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+ /**
+  * Checked exception raised by the HTML extractor.
+  */
+ public class HtmlExtractorException extends Exception {
+
+   /** Serialization version identifier. */
+   private static final long serialVersionUID = 1L;
+
+   /**
+    * @param message
+    *          is the description of the failure.
+    */
+   public HtmlExtractorException(final String message) {
+     super(message);
+   }
+
+   /**
+    * @param message
+    *          is the description of the failure.
+    * @param e
+    *          is the exception that caused the failure.
+    */
+   public HtmlExtractorException(final String message, final Exception e) {
+     super(message, e);
+   }
+
+ }
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+ /**
+  * Checked exception that carries the regular expression it relates to.
+  */
+ public class RegexException extends Exception {
+
+   /** Serialization version identifier. */
+   private static final long serialVersionUID = 1L;
+
+   /** The expression this exception is about; set once by the constructors. */
+   private final String regex;
+
+   /**
+    * @param regex
+    *          is the expression this exception is about.
+    * @param message
+    *          is the description of the failure.
+    */
+   public RegexException(final String regex, final String message) {
+     super(message);
+     this.regex = regex;
+   }
+
+   /**
+    * @param regex
+    *          is the expression this exception is about.
+    * @param message
+    *          is the description of the failure.
+    * @param e
+    *          is the exception that caused the failure.
+    */
+   public RegexException(final String regex, final String message, final Exception e) {
+     super(message, e);
+     this.regex = regex;
+   }
+
+   /**
+    * @return the expression passed to the constructor.
+    */
+   public String getRegex() {
+     return regex;
+   }
+
+ }
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# UI labels used by the HTML Extractor transformation connector templates.
+# NOTE(review): the values below are English although this is the zh_CN bundle;
+# confirm whether a Chinese translation is still pending.
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+# NOTE(review): SolrUpdateHandler is not referenced by the templates added in
+# this commit section; verify it is still needed.
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+# Row labels for the include ("englobing tag") and exclude ("tags to remove") lists.
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+# The key is named RegularExpression, but the label shown is "CSS selector".
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+# Shown by the edit script when the add field is left empty.
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+
+// Configuration validation hook: performs no checks and always reports success.
+function checkConfig()
+{
+ return true;
+}
+
+//-->
+</script>
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js Fri Mar 16 16:31:26 2018
@@ -0,0 +1,76 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+// Specification validation hook: performs no checks and always reports success.
+function s${SEQNUM}_checkSpecification()
+{
+ return true;
+}
+
+function s${SEQNUM}_addIncludeFilter()
+{
+ if (editjob.s${SEQNUM}_includefilter_regex.value == "")
+ {
+ alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+ editjob.s${SEQNUM}_includefilter_regex.focus();
+ return;
+ }
+ editjob.s${SEQNUM}_includefilter_op.value="Add";
+ postFormSetAnchor("s${SEQNUM}_includefilter");
+}
+
+function s${SEQNUM}_addExcludeFilter()
+{
+ if (editjob.s${SEQNUM}_excludefilter_regex.value == "")
+ {
+ alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+ editjob.s${SEQNUM}_excludefilter_regex.focus();
+ return;
+ }
+ editjob.s${SEQNUM}_excludefilter_op.value="Add";
+ postFormSetAnchor("s${SEQNUM}_excludefilter");
+}
+
+function s${SEQNUM}_deleteIncludeFilter(i)
+{
+ // Set the operation
+ eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Delete\"");
+ // Submit
+ if (editjob.s${SEQNUM}_includefilter_count.value==i)
+ postFormSetAnchor("s${SEQNUM}_includefilter");
+ else
+ postFormSetAnchor("s${SEQNUM}_includefilter_"+i)
+ // Undo, so we won't get two deletes next time
+ eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Continue\"");
+}
+
+function s${SEQNUM}_deleteExcludeFilter(i)
+{
+ // Set the operation
+ eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Delete\"");
+ // Submit
+ if (editjob.s${SEQNUM}_excludefilter_count.value==i)
+ postFormSetAnchor("s${SEQNUM}_excludefilter");
+ else
+ postFormSetAnchor("s${SEQNUM}_excludefilter_"+i)
+ // Undo, so we won't get two deletes next time
+ eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Continue\"");
+}
+
+//-->
+</script>
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,148 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == $ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName') && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+<tr>
+  <td class="description">
+    <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+  </td>
+  <td class="boxcell">
+    <table class="formtable">
+      <tr class="formheaderrow">
+        <td class="formcolumnheader"></td>
+        <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+      </tr>
+
+      ## One row per include filter: a delete button, the hidden per-row fields
+      ## posted with the form, and the filter text.
+      ## NOTE(review): only the first two include filters are rendered and counted
+      ## here; confirm that this cap is intentional.
+      #set($includecounter = 0)
+      #foreach($includefilter in $INCLUDEFILTERS)
+        #if(($includecounter) < 2)
+          #set($includecounterdisplay = $includecounter + 1)
+          #if(($includecounter % 2) == 0)
+      <tr class="evenformrow">
+          #else
+      <tr class="oddformrow">
+          #end
+        <td class="formcolumncell">
+          <a name="s${SEQNUM}_includefilter_$includecounter">
+            <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter'))$includecounterdisplay" onclick='javascript:s${SEQNUM}_deleteIncludeFilter("$includecounter");'/>
+            <input type="hidden" name="s${SEQNUM}_includefilter_op_$includecounter" value="Continue"/>
+            ## Post the filter value with the row, as the exclude rows and the
+            ## hidden-field branch of this template already do.
+            <input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+          </a>
+        </td>
+        <td class="formcolumncell">
+          <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+        </td>
+      </tr>
+          #set($includecounter = $includecounter + 1)
+        #end
+      #end
+
+      #if($includecounter == 0)
+      <tr class="formrow"><td class="formmessage" colspan="2">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+      #end
+
+      <tr class="formrow"><td class="formseparator" colspan="2"><hr/></td></tr>
+      <tr class="formrow">
+        <td class="formcolumncell">
+          ## The anchor name matches the one the edit script passes to postFormSetAnchor.
+          <a name="s${SEQNUM}_includefilter">
+            <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddIncludeFilter'))" onclick="javascript:s${SEQNUM}_addIncludeFilter();"/>
+          </a>
+          <input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+          <input type="hidden" name="s${SEQNUM}_includefilter_op" value="Continue"/>
+        </td>
+        <td class="formcolumncell">
+          <nobr><input type="text" size="15" name="s${SEQNUM}_includefilter_regex" value=""/></nobr>
+        </td>
+      </tr>
+    </table>
+  </td>
+</tr>
+<tr>
+  <td class="description">
+    <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+  </td>
+  <td class="boxcell">
+    <table class="formtable">
+      <tr class="formheaderrow">
+        <td class="formcolumnheader"></td>
+        <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+      </tr>
+
+      ## One row per exclude filter: a delete button, the hidden per-row fields
+      ## posted with the form, and the filter text.
+      #set($excludecounter = 0)
+      #foreach($excludefilter in $EXCLUDEFILTERS)
+        #set($excludecounterdisplay = $excludecounter + 1)
+        #if(($excludecounter % 2) == 0)
+      <tr class="evenformrow">
+        #else
+      <tr class="oddformrow">
+        #end
+        <td class="formcolumncell">
+          <a name="s${SEQNUM}_excludefilter_$excludecounter">
+            <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter'))$excludecounterdisplay" onclick='javascript:s${SEQNUM}_deleteExcludeFilter("$excludecounter");'/>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_op_$excludecounter" value="Continue"/>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+          </a>
+        </td>
+        <td class="formcolumncell">
+          <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+        </td>
+      </tr>
+        #set($excludecounter = $excludecounter + 1)
+      #end
+
+      #if($excludecounter == 0)
+      <tr class="formrow"><td class="formmessage" colspan="2">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+      #end
+
+      <tr class="formrow"><td class="formseparator" colspan="2"><hr/></td></tr>
+      <tr class="formrow">
+        <td class="formcolumncell">
+          ## The anchor name matches the one the edit script passes to postFormSetAnchor.
+          <a name="s${SEQNUM}_excludefilter">
+            <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddExcludeFilter'))" onclick="javascript:s${SEQNUM}_addExcludeFilter();"/>
+          </a>
+          <input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+          <input type="hidden" name="s${SEQNUM}_excludefilter_op" value="Continue"/>
+        </td>
+        <td class="formcolumncell">
+          <nobr><input type="text" size="15" name="s${SEQNUM}_excludefilter_regex" value=""/></nobr>
+        </td>
+      </tr>
+    </table>
+  </td>
+</tr>
+</table>
+
+#else
+
+ ## Branch taken when the tab/sequence condition above is false: nothing visible
+ ## is rendered. Each include filter is written as a hidden input, followed by
+ ## the number of include filters written.
+ #set($includecounter = 0)
+ #foreach($includefilter in $INCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+ #set($includecounter = $includecounter + 1)
+ #end
+<input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+
+ ## Same for the exclude filters: one hidden input per filter, then the count.
+ #set($excludecounter = 0)
+ #foreach($excludefilter in $EXCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+ #set($excludecounter = $excludecounter + 1)
+ #end
+<input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+
+
+#end
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Placeholder: this view renders an empty display table in this revision. -->
+<table class="displaytable">
+
+</table>
Added: manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html?rev=1827009&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html (added)
+++ manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html Fri Mar 16 16:31:26 2018
@@ -0,0 +1,82 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+## Read-only view of the specification: one row listing the include filters and
+## one row listing the exclude filters, each as a single-column table.
+<table class="displaytable">
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+        #set($includecounter = 0)
+        #foreach($includefilter in $INCLUDEFILTERS)
+          #if(($includecounter % 2) == 0)
+        <tr class="evenformrow">
+          #else
+        <tr class="oddformrow">
+          #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+          </td>
+        </tr>
+          #set($includecounter = $includecounter + 1)
+        #end
+
+        #if($includecounter == 0)
+        <tr class="formrow"><td class="formmessage">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+        #end
+      </table>
+    </td>
+  </tr>
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+        #set($excludecounter = 0)
+        #foreach($excludefilter in $EXCLUDEFILTERS)
+          #if(($excludecounter % 2) == 0)
+        <tr class="evenformrow">
+          #else
+        <tr class="oddformrow">
+          #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+          </td>
+        </tr>
+          #set($excludecounter = $excludecounter + 1)
+        #end
+
+        #if($excludecounter == 0)
+        <tr class="formrow"><td class="formmessage">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+        #end
+      </table>
+    </td>
+  </tr>
+</table>