You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/06/30 08:15:26 UTC

svn commit: r1688359 - in /manifoldcf/branches/dev_1x: ./ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connec...

Author: kwright
Date: Tue Jun 30 06:15:25 2015
New Revision: 1688359

URL: http://svn.apache.org/r1688359
Log:
Pull up fix for CONNECTORS-1218 from trunk.

Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/CHANGES.txt
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
    manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jun 30 06:15:25 2015
@@ -124,4 +124,4 @@
 /manifoldcf/branches/CONNECTORS-981:1605049-1605773
 /manifoldcf/branches/CONNECTORS-989:1611600-1612101
 /manifoldcf/branches/CONNECTORS-990:1610284-1610707
-/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390,1681735,1682232,1682252,1682410,1682602,1682622,1682719,1683208,1683506,1683529,1683768,1684015,1684017,1684153,1684156,1684712,1684866,1685547,1687097,1687303,1687768,1688070
+/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390,1681735,1682232,1682252,1682410,1682602,1682622,1682719,1683208,1683506,1683529,1683768,1684015,1684017,1684153,1684156,1684712,1684866,1685547,1687097,1687303,1687768,1688070,1688348

Modified: manifoldcf/branches/dev_1x/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/CHANGES.txt?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/CHANGES.txt (original)
+++ manifoldcf/branches/dev_1x/CHANGES.txt Tue Jun 30 06:15:25 2015
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 1.10-dev =====================
 
+CONNECTORS-1218: Add lowerNames option on Tika extractor.
+(Shinichiro Abe)
+
 CONNECTORS-1217: Fix documentation for api login parameters.
 (Karl Wright)
 

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Tue Jun 30 06:15:25 2015
@@ -29,6 +29,7 @@ public class TikaConfig {
   // Specification nodes and values
   public static final String NODE_FIELDMAP = "fieldmap";
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
+  public static final String NODE_LOWERNAMES = "lowerNames";
   public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
   public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
   public static final String ATTRIBUTE_SOURCE = "source";

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Jun 30 06:15:25 2015
@@ -33,7 +33,9 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+
 import de.l3s.boilerpipe.BoilerpipeExtractor;
+
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -306,6 +308,17 @@ public class TikaExtractor extends org.a
         String[] metaNames = metadata.names();
         for(String mName : metaNames){
           String value = metadata.get(mName);
+          if (sp.lowerNames())
+          {
+            StringBuilder sb = new StringBuilder();
+            for (int i=0; i<mName.length(); i++) {
+              char ch = mName.charAt(i);
+              if (!Character.isLetterOrDigit(ch)) ch='_';
+              else ch=Character.toLowerCase(ch);
+              sb.append(ch);
+            }
+            mName = sb.toString();
+          }
           String target = sp.getMapping(mName);
           if(target!=null)
           {
@@ -443,7 +456,9 @@ public class TikaExtractor extends org.a
       while (i < os.getChildCount())
       {
         SpecificationNode node = os.getChild(i);
-        if (node.getType().equals(TikaConfig.NODE_FIELDMAP) || node.getType().equals(TikaConfig.NODE_KEEPMETADATA))
+        if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
+          || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+          || node.getType().equals(TikaConfig.NODE_LOWERNAMES))
           os.removeChild(i);
         else
           i++;
@@ -496,6 +511,18 @@ public class TikaExtractor extends org.a
       }
       // Add the new keepallmetadata config parameter 
       os.addChild(os.getChildCount(), node);
+      
+      SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
+      String lower = variableContext.getParameter(seqPrefix+"lowernames");
+      if (lower != null)
+      {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
+      }
+      else
+      {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
+      }
+      os.addChild(os.getChildCount(), node2);
     }
     
     if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
@@ -574,6 +601,7 @@ public class TikaExtractor extends org.a
     // Prep for field mappings
     List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
     String keepAllMetadataValue = "true";
+    String lowernamesValue = "false";
     for (int i = 0; i < os.getChildCount(); i++)
     {
       SpecificationNode sn = os.getChild(i);
@@ -598,9 +626,14 @@ public class TikaExtractor extends org.a
       {
         keepAllMetadataValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
+      else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
+      {
+        lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
     }
     paramMap.put("FIELDMAPPINGS",fieldMappings);
     paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
+    paramMap.put("LOWERNAMES",lowernamesValue);
   }
 
   protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
@@ -798,11 +831,13 @@ public class TikaExtractor extends org.a
     
     private final Map<String,String> sourceTargets = new HashMap<String,String>();
     private final boolean keepAllMetadata;
+    private final boolean lowerNames;
     private final boolean ignoreTikaException;
     private final String extractorClassName;
     
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
+      boolean lowerNames = false;
       boolean ignoreTikaException = true;
       String extractorClassName = null;
       for (int i = 0; i < os.getChildCount(); i++) {
@@ -811,6 +846,9 @@ public class TikaExtractor extends org.a
         if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           keepAllMetadata = Boolean.parseBoolean(value);
+        } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          lowerNames = Boolean.parseBoolean(value);
         } else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
           String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
           String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
@@ -827,6 +865,7 @@ public class TikaExtractor extends org.a
         }
       }
       this.keepAllMetadata = keepAllMetadata;
+      this.lowerNames = lowerNames;
       this.ignoreTikaException = ignoreTikaException;
       this.extractorClassName = extractorClassName;
     }
@@ -860,7 +899,10 @@ public class TikaExtractor extends org.a
         sb.append('+');
       else
         sb.append('-');
-      
+      if (lowerNames)
+          sb.append('+');
+        else
+          sb.append('-');
       if (ignoreTikaException)
         sb.append('+');
       else
@@ -885,6 +927,10 @@ public class TikaExtractor extends org.a
       return keepAllMetadata;
     }
     
+    public boolean lowerNames() {
+      return lowerNames;
+    }
+    
     public boolean ignoreTikaException() {
       return ignoreTikaException;
     }

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Tue Jun 30 06:15:25 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=Metadata
 TikaExtractor.FinalFieldName=Final field name
 TikaExtractor.NoFieldMappingSpecified=No field mapping specified
 TikaExtractor.KeepAllMetadata=Keep all metadata:
+TikaExtractor.LowerNames=Lower names:
 TikaExtractor.Add=Add
 TikaExtractor.AddFieldMapping=Add field mapping
 TikaExtractor.Delete=Delete

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Tue Jun 30 06:15:25 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=メ
 TikaExtractor.FinalFieldName=最後のフィールド名
 TikaExtractor.NoFieldMappingSpecified=フィールドマッピングを指定してください
 TikaExtractor.KeepAllMetadata=全メタデータを保存:
+TikaExtractor.LowerNames=小文字名:
 TikaExtractor.Add=追加
 TikaExtractor.AddFieldMapping=フィールドマッピングを追加
 TikaExtractor.Delete=削除

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Tue Jun 30 06:15:25 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=元
 TikaExtractor.FinalFieldName=最终字段名
 TikaExtractor.NoFieldMappingSpecified=未指定字段映射
 TikaExtractor.KeepAllMetadata=保存所有元数据:
+TikaExtractor.LowerNames=小写:
 TikaExtractor.Add=添加
 TikaExtractor.AddFieldMapping=添加字段映射
 TikaExtractor.Delete=删除

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html Tue Jun 30 06:15:25 2015
@@ -91,6 +91,17 @@
   #end
     </td>
   </tr>
+
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+    <td class="value">
+  #if($LOWERNAMES == 'true')
+       <input type="checkbox" checked="true" name="s${SEQNUM}_lowernames" value="true"/>
+  #else
+       <input type="checkbox" name="s${SEQNUM}_lowernames" value="true"/>
+  #end
+    </td>
+  </tr>
 </table>
       
 #else
@@ -103,5 +114,6 @@
   #end
 <input type="hidden" name="s${SEQNUM}_fieldmapping_count" value="$fieldcounter"/>
 <input type="hidden" name="s${SEQNUM}_keepallmetadata" value="$Encoder.bodyEscape($KEEPALLMETADATA)"/>
+<input type="hidden" name="s${SEQNUM}_lowernames" value="$Encoder.bodyEscape($LOWERNAMES)"/>
 
 #end
\ No newline at end of file

Modified: manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1688359&r1=1688358&r2=1688359&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/branches/dev_1x/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Tue Jun 30 06:15:25 2015
@@ -53,6 +53,11 @@
   </tr>
   <tr><td class="separator" colspan="2"><hr/></td></tr>
   <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+    <td class="value"><nobr>$Encoder.bodyEscape($LOWERNAMES)</nobr></td>
+  </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
     <td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
   </tr>