You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by sh...@apache.org on 2015/06/30 05:34:25 UTC
svn commit: r1688348 - in /manifoldcf/trunk: ./
connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/
connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/
connectors/tika/...
Author: shinichiro
Date: Tue Jun 30 03:34:24 2015
New Revision: 1688348
URL: http://svn.apache.org/r1688348
Log:
Fix for CONNECTORS-1218
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jun 30 03:34:24 2015
@@ -3,6 +3,9 @@ $Id$
======================= 2.2-dev =====================
+CONNECTORS-1218: Add lowerNames option on Tika extractor.
+(Shinichiro Abe)
+
CONNECTORS-1217: Fix documentation for api login parameters.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ public class TikaConfig {
// Specification nodes and values
public static final String NODE_FIELDMAP = "fieldmap";
public static final String NODE_KEEPMETADATA = "keepAllMetadata";
+ public static final String NODE_LOWERNAMES = "lowerNames";
public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
public static final String ATTRIBUTE_SOURCE = "source";
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Jun 30 03:34:24 2015
@@ -33,7 +33,9 @@ import org.apache.tika.parser.AutoDetect
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+
import de.l3s.boilerpipe.BoilerpipeExtractor;
+
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -306,6 +308,17 @@ public class TikaExtractor extends org.a
String[] metaNames = metadata.names();
for(String mName : metaNames){
String value = metadata.get(mName);
+ if (sp.lowerNames())
+ {
+ StringBuilder sb = new StringBuilder();
+ for (int i=0; i<mName.length(); i++) {
+ char ch = mName.charAt(i);
+ if (!Character.isLetterOrDigit(ch)) ch='_';
+ else ch=Character.toLowerCase(ch);
+ sb.append(ch);
+ }
+ mName = sb.toString();
+ }
String target = sp.getMapping(mName);
if(target!=null)
{
@@ -443,7 +456,9 @@ public class TikaExtractor extends org.a
while (i < os.getChildCount())
{
SpecificationNode node = os.getChild(i);
- if (node.getType().equals(TikaConfig.NODE_FIELDMAP) || node.getType().equals(TikaConfig.NODE_KEEPMETADATA))
+ if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
+ || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+ || node.getType().equals(TikaConfig.NODE_LOWERNAMES))
os.removeChild(i);
else
i++;
@@ -496,6 +511,18 @@ public class TikaExtractor extends org.a
}
// Add the new keepallmetadata config parameter
os.addChild(os.getChildCount(), node);
+
+ SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
+ String lower = variableContext.getParameter(seqPrefix+"lowernames");
+ if (lower != null)
+ {
+ node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
+ }
+ else
+ {
+ node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
+ }
+ os.addChild(os.getChildCount(), node2);
}
if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
@@ -574,6 +601,7 @@ public class TikaExtractor extends org.a
// Prep for field mappings
List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
String keepAllMetadataValue = "true";
+ String lowernamesValue = "false";
for (int i = 0; i < os.getChildCount(); i++)
{
SpecificationNode sn = os.getChild(i);
@@ -598,9 +626,14 @@ public class TikaExtractor extends org.a
{
keepAllMetadataValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
+ else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
+ {
+ lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ }
}
paramMap.put("FIELDMAPPINGS",fieldMappings);
paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
+ paramMap.put("LOWERNAMES",lowernamesValue);
}
protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
@@ -798,11 +831,13 @@ public class TikaExtractor extends org.a
private final Map<String,String> sourceTargets = new HashMap<String,String>();
private final boolean keepAllMetadata;
+ private final boolean lowerNames;
private final boolean ignoreTikaException;
private final String extractorClassName;
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
+ boolean lowerNames = false;
boolean ignoreTikaException = true;
String extractorClassName = null;
for (int i = 0; i < os.getChildCount(); i++) {
@@ -811,6 +846,9 @@ public class TikaExtractor extends org.a
if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
keepAllMetadata = Boolean.parseBoolean(value);
+ } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ lowerNames = Boolean.parseBoolean(value);
} else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
@@ -827,6 +865,7 @@ public class TikaExtractor extends org.a
}
}
this.keepAllMetadata = keepAllMetadata;
+ this.lowerNames = lowerNames;
this.ignoreTikaException = ignoreTikaException;
this.extractorClassName = extractorClassName;
}
@@ -860,7 +899,10 @@ public class TikaExtractor extends org.a
sb.append('+');
else
sb.append('-');
-
+ if (lowerNames)
+ sb.append('+');
+ else
+ sb.append('-');
if (ignoreTikaException)
sb.append('+');
else
@@ -885,6 +927,10 @@ public class TikaExtractor extends org.a
return keepAllMetadata;
}
+ public boolean lowerNames() {
+ return lowerNames;
+ }
+
public boolean ignoreTikaException() {
return ignoreTikaException;
}
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=Metadata
TikaExtractor.FinalFieldName=Final field name
TikaExtractor.NoFieldMappingSpecified=No field mapping specified
TikaExtractor.KeepAllMetadata=Keep all metadata:
+TikaExtractor.LowerNames=Lower names:
TikaExtractor.Add=Add
TikaExtractor.AddFieldMapping=Add field mapping
TikaExtractor.Delete=Delete
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=メタ
TikaExtractor.FinalFieldName=最後のフィールド名
TikaExtractor.NoFieldMappingSpecified=フィールドマッピングを指定してください
TikaExtractor.KeepAllMetadata=全メタデータを保存:
+TikaExtractor.LowerNames=小文字化:
TikaExtractor.Add=追加
TikaExtractor.AddFieldMapping=フィールドマッピングを追加
TikaExtractor.Delete=削除
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Tue Jun 30 03:34:24 2015
@@ -29,6 +29,7 @@ TikaExtractor.MetadataFieldName=元数
TikaExtractor.FinalFieldName=最终字段名
TikaExtractor.NoFieldMappingSpecified=未指定字段映射
TikaExtractor.KeepAllMetadata=保存所有元数据:
+TikaExtractor.LowerNames=小写:
TikaExtractor.Add=添加
TikaExtractor.AddFieldMapping=添加字段映射
TikaExtractor.Delete=删除
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html Tue Jun 30 03:34:24 2015
@@ -91,6 +91,17 @@
#end
</td>
</tr>
+
+ <tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+ <td class="value">
+ #if($LOWERNAMES == 'true')
+ <input type="checkbox" checked="true" name="s${SEQNUM}_lowernames" value="true"/>
+ #else
+ <input type="checkbox" name="s${SEQNUM}_lowernames" value="true"/>
+ #end
+ </td>
+ </tr>
</table>
#else
@@ -103,5 +114,6 @@
#end
<input type="hidden" name="s${SEQNUM}_fieldmapping_count" value="$fieldcounter"/>
<input type="hidden" name="s${SEQNUM}_keepallmetadata" value="$Encoder.bodyEscape($KEEPALLMETADATA)"/>
+<input type="hidden" name="s${SEQNUM}_lowernames" value="$Encoder.bodyEscape($LOWERNAMES)"/>
#end
\ No newline at end of file
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1688348&r1=1688347&r2=1688348&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Tue Jun 30 03:34:24 2015
@@ -53,6 +53,11 @@
</tr>
<tr><td class="separator" colspan="2"><hr/></td></tr>
<tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.LowerNames'))</nobr></td>
+ <td class="value"><nobr>$Encoder.bodyEscape($LOWERNAMES)</nobr></td>
+ </tr>
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
+ <tr>
<td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
<td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
</tr>