You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by sh...@apache.org on 2015/06/30 05:34:25 UTC

svn commit: r1688348 - in /manifoldcf/trunk: ./ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/...

Author: shinichiro
Date: Tue Jun 30 03:34:24 2015
New Revision: 1688348

URL: http://svn.apache.org/r1688348
Log:
Fix for CONNECTORS-1218

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jun 30 03:34:24 2015
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.2-dev =====================
 
+CONNECTORS-1218: Add lowerNames option on Tika extractor.
+(Shinichiro Abe)
+
 CONNECTORS-1217: Fix documentation for api login parameters.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ public class TikaConfig {
   // Specification nodes and values
   public static final String NODE_FIELDMAP = "fieldmap";
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
+  public static final String NODE_LOWERNAMES = "lowerNames";
   public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
   public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
   public static final String ATTRIBUTE_SOURCE = "source";

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Jun 30 03:34:24 2015
@@ -33,7 +33,9 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+
 import de.l3s.boilerpipe.BoilerpipeExtractor;
+
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -306,6 +308,17 @@ public class TikaExtractor extends org.a
         String[] metaNames = metadata.names();
         for(String mName : metaNames){
           String value = metadata.get(mName);
+          if (sp.lowerNames())
+          {
+            StringBuilder sb = new StringBuilder();
+            for (int i=0; i<mName.length(); i++) {
+              char ch = mName.charAt(i);
+              if (!Character.isLetterOrDigit(ch)) ch='_';
+              else ch=Character.toLowerCase(ch);
+              sb.append(ch);
+            }
+            mName = sb.toString();
+          }
           String target = sp.getMapping(mName);
           if(target!=null)
           {
@@ -443,7 +456,9 @@ public class TikaExtractor extends org.a
       while (i < os.getChildCount())
       {
         SpecificationNode node = os.getChild(i);
-        if (node.getType().equals(TikaConfig.NODE_FIELDMAP) || node.getType().equals(TikaConfig.NODE_KEEPMETADATA))
+        if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
+          || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+          || node.getType().equals(TikaConfig.NODE_LOWERNAMES))
           os.removeChild(i);
         else
           i++;
@@ -496,6 +511,18 @@ public class TikaExtractor extends org.a
       }
       // Add the new keepallmetadata config parameter 
       os.addChild(os.getChildCount(), node);
+      
+      SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
+      String lower = variableContext.getParameter(seqPrefix+"lowernames");
+      if (lower != null)
+      {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
+      }
+      else
+      {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
+      }
+      os.addChild(os.getChildCount(), node2);
     }
     
     if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
@@ -574,6 +601,7 @@ public class TikaExtractor extends org.a
     // Prep for field mappings
     List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
     String keepAllMetadataValue = "true";
+    String lowernamesValue = "false";
     for (int i = 0; i < os.getChildCount(); i++)
     {
       SpecificationNode sn = os.getChild(i);
@@ -598,9 +626,14 @@ public class TikaExtractor extends org.a
       {
         keepAllMetadataValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
+      else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
+      {
+        lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
     }
     paramMap.put("FIELDMAPPINGS",fieldMappings);
     paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
+    paramMap.put("LOWERNAMES",lowernamesValue);
   }
 
   protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
@@ -798,11 +831,13 @@ public class TikaExtractor extends org.a
     
     private final Map<String,String> sourceTargets = new HashMap<String,String>();
     private final boolean keepAllMetadata;
+    private final boolean lowerNames;
     private final boolean ignoreTikaException;
     private final String extractorClassName;
     
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
+      boolean lowerNames = false;
       boolean ignoreTikaException = true;
       String extractorClassName = null;
       for (int i = 0; i < os.getChildCount(); i++) {
@@ -811,6 +846,9 @@ public class TikaExtractor extends org.a
         if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           keepAllMetadata = Boolean.parseBoolean(value);
+        } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          lowerNames = Boolean.parseBoolean(value);
         } else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
           String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
           String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
@@ -827,6 +865,7 @@ public class TikaExtractor extends org.a
         }
       }
       this.keepAllMetadata = keepAllMetadata;
+      this.lowerNames = lowerNames;
       this.ignoreTikaException = ignoreTikaException;
       this.extractorClassName = extractorClassName;
     }
@@ -860,7 +899,10 @@ public class TikaExtractor extends org.a
         sb.append('+');
       else
         sb.append('-');
-      
+      if (lowerNames)
+          sb.append('+');
+        else
+          sb.append('-');
       if (ignoreTikaException)
         sb.append('+');
       else
@@ -885,6 +927,10 @@ public class TikaExtractor extends org.a
       return keepAllMetadata;
     }
     
+    public boolean lowerNames() {
+      return lowerNames;
+    }
+    
     public boolean ignoreTikaException() {
       return ignoreTikaException;
     }

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=Metadata
 TikaExtractor.FinalFieldName=Final field name
 TikaExtractor.NoFieldMappingSpecified=No field mapping specified
 TikaExtractor.KeepAllMetadata=Keep all metadata:
+TikaExtractor.LowerNames=Lower names:
 TikaExtractor.Add=Add
 TikaExtractor.AddFieldMapping=Add field mapping
 TikaExtractor.Delete=Delete

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=メタ
 TikaExtractor.FinalFieldName=最後のフィールド名
 TikaExtractor.NoFieldMappingSpecified=フィールドマッピングを指定してください
 TikaExtractor.KeepAllMetadata=全メタデータを保存:
+TikaExtractor.LowerNames=小文字名:
 TikaExtractor.Add=追加
 TikaExtractor.AddFieldMapping=フィールドマッピングを追加
 TikaExtractor.Delete=削除

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=元数
 TikaExtractor.FinalFieldName=最终字段名
 TikaExtractor.NoFieldMappingSpecified=未指定字段映射
 TikaExtractor.KeepAllMetadata=保存所有元数据:
+TikaExtractor.LowerNames=小写:
 TikaExtractor.Add=添加
 TikaExtractor.AddFieldMapping=添加字段映射
 TikaExtractor.Delete=删除

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html Tue Jun 30 03:34:24 2015
@@ -91,6 +91,17 @@
   #end
     </td>
   </tr>
+
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+    <td class="value">
+  #if($LOWERNAMES == 'true')
+       <input type="checkbox" checked="true" name="s${SEQNUM}_lowernames" value="true"/>
+  #else
+       <input type="checkbox" name="s${SEQNUM}_lowernames" value="true"/>
+  #end
+    </td>
+  </tr>
 </table>
       
 #else
@@ -103,5 +114,6 @@
   #end
 <input type="hidden" name="s${SEQNUM}_fieldmapping_count" value="$fieldcounter"/>
 <input type="hidden" name="s${SEQNUM}_keepallmetadata" value="$Encoder.bodyEscape($KEEPALLMETADATA)"/>
+<input type="hidden" name="s${SEQNUM}_lowernames" value="$Encoder.bodyEscape($LOWERNAMES)"/>
 
 #end
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Tue Jun 30 03:34:24 2015
@@ -53,6 +53,11 @@
   </tr>
   <tr><td class="separator" colspan="2"><hr/></td></tr>
   <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+    <td class="value"><nobr>$Encoder.bodyEscape($LOWERNAMES)</nobr></td>
+  </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
     <td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
   </tr>