You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/28 13:36:11 UTC
svn commit: r1634857 - in /manifoldcf/trunk: ./
connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/
connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/
connectors/tika/...
Author: kwright
Date: Tue Oct 28 12:36:11 2014
New Revision: 1634857
URL: http://svn.apache.org/r1634857
Log:
Fix for CONNECTORS-1088.
Added:
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html (with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Oct 28 12:36:11 2014
@@ -3,6 +3,9 @@ $Id$
======================= 2.0-dev =====================
+CONNECTORS-1088: Add boilerplate extraction to Tika extractor.
+(Arcadius Ahouansou, Karl Wright)
+
CONNECTORS-1087: Fix failing alfresco-webscript unit test.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Oct 28 12:36:11 2014
@@ -47,6 +47,7 @@ public class TikaExtractor extends org.a
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
+ private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML = "editSpecification_Boilerplate.html";
private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
protected static final String ACTIVITY_EXTRACT = "extract";
@@ -373,10 +374,12 @@ public class TikaExtractor extends org.a
tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
+ tabsArray.add(Messages.getString(locale, "TikaExtractor.BoilerplateTabName"));
// Fill in the specification header map, using data from all tabs.
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
+ fillInBoilerplateSpecificationMap(paramMap, os);
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
}
@@ -407,9 +410,11 @@ public class TikaExtractor extends org.a
// Fill in the field mapping tab data
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
-
+ fillInBoilerplateSpecificationMap(paramMap, os);
+
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
+ Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
}
/** Process a specification post.
@@ -514,6 +519,27 @@ public class TikaExtractor extends org.a
os.addChild(os.getChildCount(), node);
}
+ x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
+ if (x != null)
+ {
+ int i = 0;
+ while (i < os.getChildCount())
+ {
+ SpecificationNode node = os.getChild(i);
+ if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
+ os.removeChild(i);
+ else
+ i++;
+ }
+
+ if (x.length() > 0)
+ {
+ SpecificationNode node = new SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
+ node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
+ os.addChild(os.getChildCount(), node);
+ }
+ }
+
return null;
}
@@ -537,6 +563,7 @@ public class TikaExtractor extends org.a
// Fill in the map with data from all tabs
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
+ fillInBoilerplateSpecificationMap(paramMap, os);
Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
@@ -590,6 +617,20 @@ public class TikaExtractor extends org.a
paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
}
+ protected static void fillInBoilerplateSpecificationMap(Map<String,Object> paramMap, Specification os) // Publishes the boilerplate extractor class name found in the specification into the template parameter map.
+ {
+ String boilerplateClassName = ""; // Value published when the specification contains no boilerplate processor node.
+ for (int i = 0; i < os.getChildCount(); i++)
+ {
+ SpecificationNode sn = os.getChild(i);
+ if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
+ {
+ boilerplateClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE); // No break here: if several such nodes exist, the last one scanned wins.
+ }
+ }
+ paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName); // The Velocity templates read this entry as $BOILERPLATECLASSNAME.
+ }
+
protected static int handleTikaException(TikaException e)
throws IOException, ManifoldCFException, ServiceInterruption
{
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
TikaExtractor.FieldMappingTabName=Field mapping
TikaExtractor.ExceptionsTabName=Exceptions
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
TikaExtractor.FieldMappings=Field mappings:
TikaExtractor.MetadataFieldName=Metadata field name
TikaExtractor.FinalFieldName=Final field name
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
TikaExtractor.FieldMappingTabName=フィールドマッピング
TikaExtractor.ExceptionsTabName=例外
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
TikaExtractor.FieldMappings=フィールドマッピング:
TikaExtractor.MetadataFieldName=メタデータフィールド名
TikaExtractor.FinalFieldName=最後のフィールド名
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
TikaExtractor.FieldMappingTabName=字段映射
TikaExtractor.ExceptionsTabName=异常
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
TikaExtractor.FieldMappings=字段映射:
TikaExtractor.MetadataFieldName=元数据字段名
TikaExtractor.FinalFieldName=最终字段名
Added: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html?rev=1634857&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html (added)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html Tue Oct 28 12:36:11 2014
@@ -0,0 +1,70 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == $ResourceBundle.getString('TikaExtractor.BoilerplateTabName') && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
+ <tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BoilerplateExtractorColon'))</nobr></td>
+ <td class="value">
+ <select name="s${SEQNUM}_boilerplateclassname">
+ #if($BOILERPLATECLASSNAME == '')
+ <option value="" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</option>
+ #else
+ <option value="">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.ArticleExtractor')
+ <option value="de.l3s.boilerpipe.extractors.ArticleExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticles'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.ArticleExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticles'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.ArticleSentencesExtractor')
+ <option value="de.l3s.boilerpipe.extractors.ArticleSentencesExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticleSentences'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.ArticleSentencesExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticleSentences'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.DefaultExtractor')
+ <option value="de.l3s.boilerpipe.extractors.DefaultExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BasicExtraction'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.DefaultExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BasicExtraction'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.KeepEverythingExtractor')
+ <option value="de.l3s.boilerpipe.extractors.KeepEverythingExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractEverything'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.KeepEverythingExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractEverything'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.LargestContentExtractor')
+ <option value="de.l3s.boilerpipe.extractors.LargestContentExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractLargestTextComponent'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.LargestContentExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractLargestTextComponent'))</option>
+ #end
+ #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.NumWordsRulesExtractor')
+ <option value="de.l3s.boilerpipe.extractors.NumWordsRulesExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractNumWords'))</option>
+ #else
+ <option value="de.l3s.boilerpipe.extractors.NumWordsRulesExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractNumWords'))</option>
+ #end
+ </select>
+ </td>
+ </tr>
+</table>
+
+#else
+
+<input type="hidden" name="s${SEQNUM}_boilerplateclassname" value="$Encoder.attributeEscape($BOILERPLATECLASSNAME)"/>
+
+#end
\ No newline at end of file
Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html Tue Oct 28 12:36:11 2014
@@ -35,6 +35,6 @@
#else
<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions_present" value="true"/>
-<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions" value="$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)"/>
+<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions" value="$Encoder.attributeEscape($IGNORETIKAEXCEPTIONS)"/>
#end
\ No newline at end of file
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Tue Oct 28 12:36:11 2014
@@ -56,5 +56,16 @@
<td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
<td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
</tr>
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
+ <tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BoilerplateExtractorColon'))</nobr></td>
+ <td class="value">
+#if ($BOILERPLATECLASSNAME == '')
+ <nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</nobr>
+#else
+ <nobr>$Encoder.bodyEscape($BOILERPLATECLASSNAME)</nobr>
+#end
+ </td>
+ </tr>
</table>