You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/28 13:36:11 UTC

svn commit: r1634857 - in /manifoldcf/trunk: ./ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/...

Author: kwright
Date: Tue Oct 28 12:36:11 2014
New Revision: 1634857

URL: http://svn.apache.org/r1634857
Log:
Fix for CONNECTORS-1088.

Added:
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html   (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Oct 28 12:36:11 2014
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.0-dev =====================
 
+CONNECTORS-1088: Add boilerplate extraction to Tika extractor.
+(Arcadius Ahouansou, Karl Wright)
+
 CONNECTORS-1087: Fix failing alfresco-webscript unit test.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Oct 28 12:36:11 2014
@@ -47,6 +47,7 @@ public class TikaExtractor extends org.a
   private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
   private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
   private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
+  private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML = "editSpecification_Boilerplate.html";
   private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
 
   protected static final String ACTIVITY_EXTRACT = "extract";
@@ -373,10 +374,12 @@ public class TikaExtractor extends org.a
 
     tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
     tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
+    tabsArray.add(Messages.getString(locale, "TikaExtractor.BoilerplateTabName"));
 
     // Fill in the specification header map, using data from all tabs.
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
+    fillInBoilerplateSpecificationMap(paramMap, os);
     
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
   }
@@ -407,9 +410,11 @@ public class TikaExtractor extends org.a
     // Fill in the field mapping tab data
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
-
+    fillInBoilerplateSpecificationMap(paramMap, os);
+    
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
+    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
   }
 
   /** Process a specification post.
@@ -514,6 +519,27 @@ public class TikaExtractor extends org.a
       os.addChild(os.getChildCount(), node);
     }
     
+    x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
+    if (x != null)
+    {
+      int i = 0;
+      while (i < os.getChildCount())
+      {
+        SpecificationNode node = os.getChild(i);
+        if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
+          os.removeChild(i);
+        else
+          i++;
+      }
+
+      if (x.length() > 0)
+      {
+        SpecificationNode node = new SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
+        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
+        os.addChild(os.getChildCount(), node);
+      }
+    }
+    
     return null;
   }
   
@@ -537,6 +563,7 @@ public class TikaExtractor extends org.a
     // Fill in the map with data from all tabs
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
+    fillInBoilerplateSpecificationMap(paramMap, os);
 
     Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
     
@@ -590,6 +617,20 @@ public class TikaExtractor extends org.a
     paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
   }
 
+  protected static void fillInBoilerplateSpecificationMap(Map<String,Object> paramMap, Specification os)
+  {
+    String boilerplateClassName = "";
+    for (int i = 0; i < os.getChildCount(); i++)
+    {
+      SpecificationNode sn = os.getChild(i);
+      if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
+      {
+        boilerplateClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
+    }
+    paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName);
+  }
+
   protected static int handleTikaException(TikaException e)
     throws IOException, ManifoldCFException, ServiceInterruption
   {

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
 
 TikaExtractor.FieldMappingTabName=Field mapping
 TikaExtractor.ExceptionsTabName=Exceptions
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
 TikaExtractor.FieldMappings=Field mappings:
 TikaExtractor.MetadataFieldName=Metadata field name
 TikaExtractor.FinalFieldName=Final field name

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
 
 TikaExtractor.FieldMappingTabName=フィールドマッピング
 TikaExtractor.ExceptionsTabName=例外
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
 TikaExtractor.FieldMappings=フィールドマッピング:
 TikaExtractor.MetadataFieldName=メタデータフィールド名
 TikaExtractor.FinalFieldName=最後のフィールド名

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Tue Oct 28 12:36:11 2014
@@ -15,6 +15,15 @@
 
 TikaExtractor.FieldMappingTabName=字段映射
 TikaExtractor.ExceptionsTabName=异常
+TikaExtractor.BoilerplateTabName=Boilerplate
+TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
+TikaExtractor.NoExtractionSelected=-- No extraction selected --
+TikaExtractor.ExtractArticles=Extract articles
+TikaExtractor.ExtractArticleSentences=Extract article sentences
+TikaExtractor.BasicExtraction=Basic general-purpose extraction
+TikaExtractor.ExtractEverything=Extract everything
+TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
+TikaExtractor.ExtractNumWords=Extract based on number of words per block
 TikaExtractor.FieldMappings=字段映射:
 TikaExtractor.MetadataFieldName=元数据字段名
 TikaExtractor.FinalFieldName=最终字段名

Added: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html?rev=1634857&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html (added)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html Tue Oct 28 12:36:11 2014
@@ -0,0 +1,70 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == $ResourceBundle.getString('TikaExtractor.BoilerplateTabName') && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BoilerplateExtractorColon'))</nobr></td>
+    <td class="value">
+      <select name="s${SEQNUM}_boilerplateclassname">
+  #if($BOILERPLATECLASSNAME == '')
+        <option value="" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</option>
+  #else
+        <option value="">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.ArticleExtractor')
+        <option value="de.l3s.boilerpipe.extractors.ArticleExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticles'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.ArticleExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticles'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.ArticleSentencesExtractor')
+        <option value="de.l3s.boilerpipe.extractors.ArticleSentencesExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticleSentences'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.ArticleSentencesExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractArticleSentences'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.DefaultExtractor')
+        <option value="de.l3s.boilerpipe.extractors.DefaultExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BasicExtraction'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.DefaultExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BasicExtraction'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.KeepEverythingExtractor')
+        <option value="de.l3s.boilerpipe.extractors.KeepEverythingExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractEverything'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.KeepEverythingExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractEverything'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.LargestContentExtractor')
+        <option value="de.l3s.boilerpipe.extractors.LargestContentExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractLargestTextComponent'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.LargestContentExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractLargestTextComponent'))</option>
+  #end
+  #if($BOILERPLATECLASSNAME == 'de.l3s.boilerpipe.extractors.NumWordsRulesExtractor')
+        <option value="de.l3s.boilerpipe.extractors.NumWordsRulesExtractor" selected="true">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractNumWords'))</option>
+  #else
+        <option value="de.l3s.boilerpipe.extractors.NumWordsRulesExtractor">$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.ExtractNumWords'))</option>
+  #end
+      </select>
+    </td>
+  </tr>
+</table>
+      
+#else
+
+<input type="hidden" name="s${SEQNUM}_boilerplateclassname" value="$Encoder.attributeEscape($BOILERPLATECLASSNAME)"/>
+
+#end
\ No newline at end of file

Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Boilerplate.html
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html Tue Oct 28 12:36:11 2014
@@ -35,6 +35,6 @@
 #else
 
 <input type="hidden" name="s${SEQNUM}_ignoretikaexceptions_present" value="true"/>
-<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions" value="$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)"/>
+<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions" value="$Encoder.attributeEscape($IGNORETIKAEXCEPTIONS)"/>
 
 #end
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1634857&r1=1634856&r2=1634857&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Tue Oct 28 12:36:11 2014
@@ -56,5 +56,16 @@
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
     <td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
   </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.BoilerplateExtractorColon'))</nobr></td>
+    <td class="value">
+#if ($BOILERPLATECLASSNAME == '')
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.NoExtractionSelected'))</nobr>
+#else
+      <nobr>$Encoder.bodyEscape($BOILERPLATECLASSNAME)</nobr>
+#end
+    </td>
+  </tr>
 
 </table>