You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/10 02:11:31 UTC
svn commit: r1630623 - in /manifoldcf/trunk: ./
connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/
connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transfo...
Author: kwright
Date: Fri Oct 10 00:11:30 2014
New Revision: 1630623
URL: http://svn.apache.org/r1630623
Log:
Fix for CONNECTORS-1068.
Modified:
manifoldcf/trunk/ (props changed)
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java
manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilterConfig.java
manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_en_US.properties
manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_ja_JP.properties
manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_zh_CN.properties
manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/editSpecification_Contents.html
manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/viewSpecification.html
manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
Merged /manifoldcf/branches/CONNECTORS-1068:r1630245-1630621
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Oct 10 00:11:30 2014
@@ -3,6 +3,10 @@ $Id$
======================= 2.0-dev =====================
+CONNECTORS-1068: Enhancements for Document Filter transformation
+connector.
+(Karl Wright)
+
CONNECTORS-1070: Exit immediately upon finding a misconfigured
ManifoldCF.
(Kamil Żyta, Karl Wright)
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java Fri Oct 10 00:11:30 2014
@@ -63,6 +63,29 @@ public class DocumentFilter extends org.
return new VersionContext(sp.toPackedString(),params,os);
}
+ /** Detect if a document date is acceptable or not. This method is used to determine whether it makes sense to fetch a document
+ * in the first place.
+ *@param outputDescription is the document's output version.
+ *@param date is the date of the document.
+ *@param activities is an object including the activities that can be performed by this method.
+ *@return true if the document with that date can be accepted by this connector.
+ */
+ @Override
+ public boolean checkDateIndexable(VersionContext outputDescription, Date date, IOutputCheckActivity activities)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
+ return checkDateIndexable(sp, outputDescription, date, activities);
+ }
+
+ protected boolean checkDateIndexable(SpecPacker sp, VersionContext outputDescription, Date date, IOutputCheckActivity activities)
+ throws ManifoldCFException, ServiceInterruption {
+ if (sp.checkDate(date))
+ return super.checkDateIndexable(outputDescription, date, activities);
+ else
+ return false;
+ }
+
/** Detect if a mime type is indexable or not. This method is used by participating repository connectors to pre-filter the number of
* unusable documents that will be passed to this output connector.
*@param outputDescription is the document's output version.
@@ -74,6 +97,11 @@ public class DocumentFilter extends org.
throws ManifoldCFException, ServiceInterruption
{
SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
+ return checkMimeTypeIndexable(sp, outputDescription, mimeType, activities);
+ }
+
+ protected boolean checkMimeTypeIndexable(SpecPacker sp, VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
+ throws ManifoldCFException, ServiceInterruption {
if (sp.checkMimeType(mimeType))
return super.checkMimeTypeIndexable(outputDescription, mimeType, activities);
else
@@ -84,6 +112,11 @@ public class DocumentFilter extends org.
public boolean checkLengthIndexable(VersionContext outputDescription, long length, IOutputCheckActivity activities)
throws ManifoldCFException, ServiceInterruption {
SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
+ return checkLengthIndexable(sp, outputDescription, length, activities);
+ }
+
+ protected boolean checkLengthIndexable(SpecPacker sp, VersionContext outputDescription, long length, IOutputCheckActivity activities)
+ throws ManifoldCFException, ServiceInterruption {
if (sp.checkLengthIndexable(length))
return super.checkLengthIndexable(outputDescription, length, activities);
else
@@ -94,6 +127,11 @@ public class DocumentFilter extends org.
public boolean checkURLIndexable(VersionContext outputDescription, String url, IOutputCheckActivity activities)
throws ManifoldCFException, ServiceInterruption {
SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
+ return checkURLIndexable(sp, outputDescription, url, activities);
+ }
+
+ protected boolean checkURLIndexable(SpecPacker sp, VersionContext outputDescription, String url, IOutputCheckActivity activities)
+ throws ManifoldCFException, ServiceInterruption {
if (sp.checkURLIndexable(url))
return super.checkURLIndexable(outputDescription, url, activities);
else
@@ -103,9 +141,6 @@ public class DocumentFilter extends org.
/** Add (or replace) a document in the output data store using the connector.
* This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
* necessary.
- * The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the
- * output description, since that was what was partly used to determine if output should be taking place. So it may be necessary for this method to decode
- * an output description string in order to determine what should be done.
*@param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process
* and serve the document. This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
*@param outputDescription is the description string that was constructed for this document by the getOutputDescription() method.
@@ -118,6 +153,15 @@ public class DocumentFilter extends org.
public int addOrReplaceDocumentWithException(String documentURI, VersionContext outputDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
{
+ // Hard filtering (in case connectors don't call check methods above)
+ SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
+ if (!checkURLIndexable(sp, outputDescription, documentURI, activities) ||
+ !checkLengthIndexable(sp, outputDescription, document.getBinaryLength(), activities) ||
+ !checkMimeTypeIndexable(sp, outputDescription, document.getMimeType(), activities) ||
+ !checkDateIndexable(sp, outputDescription, document.getModifiedDate(), activities)) {
+ activities.noDocument();
+ return DOCUMENTSTATUS_REJECTED;
+ }
return activities.sendDocument(documentURI, document);
}
@@ -127,6 +171,7 @@ public class DocumentFilter extends org.
String maxFileSize = DocumentFilterConfig.MAXLENGTH_DEFAULT;
String allowedMimeTypes = DocumentFilterConfig.MIMETYPES_DEFAULT;
String allowedFileExtensions = DocumentFilterConfig.EXTENSIONS_DEFAULT;
+ Long minDate = null;
for (int i = 0; i < os.getChildCount(); i++)
{
SpecificationNode sn = os.getChild(i);
@@ -138,11 +183,21 @@ public class DocumentFilter extends org.
allowedMimeTypes = sn.getValue();
else if (sn.getType().equals(DocumentFilterConfig.NODE_EXTENSIONS))
allowedFileExtensions = sn.getValue();
+ else if (sn.getType().equals(DocumentFilterConfig.NODE_MINDATE))
+ minDate = new Long(sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE));
}
paramMap.put("MINFILESIZE",minFileSize);
paramMap.put("MAXFILESIZE",maxFileSize);
paramMap.put("MIMETYPES",allowedMimeTypes);
paramMap.put("EXTENSIONS",allowedFileExtensions);
+
+ Calendar c = new GregorianCalendar();
+ c.setTimeInMillis((minDate==null)?0L:minDate.longValue());
+ paramMap.put("MINDATEYEAR",Integer.toString(c.get(Calendar.YEAR)));
+ paramMap.put("MINDATEMONTH",Integer.toString(c.get(Calendar.MONTH)));
+ paramMap.put("MINDATEDAY",Integer.toString(c.get(Calendar.DAY_OF_MONTH)));
+ paramMap.put("MINDATEHOUR",Integer.toString(c.get(Calendar.HOUR_OF_DAY)));
+ paramMap.put("MINDATEMINUTE",String.format("%02d",c.get(Calendar.MINUTE)));
}
/** Obtain the name of the form check javascript method to call.
@@ -234,6 +289,36 @@ public class DocumentFilter extends org.
throws ManifoldCFException {
String seqPrefix = "s"+connectionSequenceNumber+"_";
+ String minDateYear = variableContext.getParameter(seqPrefix+"mindateyear");
+ String minDateMonth = variableContext.getParameter(seqPrefix+"mindatemonth");
+ String minDateDay = variableContext.getParameter(seqPrefix + "mindateday");
+ String minDateHour = variableContext.getParameter(seqPrefix + "mindatehour");
+ String minDateMinute = variableContext.getParameter(seqPrefix + "mindateminute");
+ if (minDateYear != null && minDateMonth != null && minDateDay != null && minDateHour != null && minDateMinute != null)
+ {
+ Calendar c = new GregorianCalendar();
+ try
+ {
+ c.set(Integer.parseInt(minDateYear),Integer.parseInt(minDateMonth),Integer.parseInt(minDateDay),Integer.parseInt(minDateHour),Integer.parseInt(minDateMinute));
+ }
+ catch (Exception e)
+ {
+ }
+ long theTime = c.getTimeInMillis();
+ int i = 0;
+ while (i < os.getChildCount())
+ {
+ SpecificationNode node = os.getChild(i);
+ if (node.getType().equals(DocumentFilterConfig.NODE_MINDATE))
+ os.removeChild(i);
+ else
+ i++;
+ }
+ SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_MINDATE);
+ sn.setAttribute(DocumentFilterConfig.ATTRIBUTE_VALUE,new Long(theTime).toString());
+ os.addChild(os.getChildCount(),sn);
+ }
+
String x;
x = variableContext.getParameter(seqPrefix+"minfilesize");
@@ -331,7 +416,8 @@ public class DocumentFilter extends org.
}
- protected static void fillSet(Set<String> set, String input) {
+ protected static Set<String> fillSet(String input) {
+ Set<String> rval = new HashSet<String>();
try
{
StringReader sr = new StringReader(input);
@@ -340,8 +426,10 @@ public class DocumentFilter extends org.
while ((line = br.readLine()) != null)
{
line = line.trim();
- if (line.length() > 0)
- set.add(line.toLowerCase(Locale.ROOT));
+ if (line.equals("*"))
+ rval = null;
+ else if (rval != null && line.length() > 0)
+ rval.add(line.toLowerCase(Locale.ROOT));
}
}
catch (IOException e)
@@ -349,16 +437,21 @@ public class DocumentFilter extends org.
// Should never happen
throw new RuntimeException("IO exception reading strings: "+e.getMessage(),e);
}
+ return rval;
}
protected static class SpecPacker {
- private final Set<String> extensions = new HashSet<String>();
- private final Set<String> mimeTypes = new HashSet<String>();
+ // null means "match everything"
+ private final Set<String> extensions;
+ // null means "match everything"
+ private final Set<String> mimeTypes;
private final Long minLength;
private final Long lengthCutoff;
+ private final Long minDate;
public SpecPacker(Specification os) {
+ Long minDate = null;
Long minLength = null;
Long lengthCutoff = null;
String extensions = null;
@@ -376,12 +469,16 @@ public class DocumentFilter extends org.
} else if (sn.getType().equals(DocumentFilterConfig.NODE_MINLENGTH)) {
String value = sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE);
minLength = new Long(value);
+ } else if (sn.getType().equals(DocumentFilterConfig.NODE_MINDATE)) {
+ String value = sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE);
+ minDate = new Long(value);
}
}
+ this.minDate = minDate;
this.minLength = minLength;
this.lengthCutoff = lengthCutoff;
- fillSet(this.extensions, extensions);
- fillSet(this.mimeTypes, mimeTypes);
+ this.extensions = fillSet(extensions);
+ this.mimeTypes = fillSet(mimeTypes);
}
public String toPackedString() {
@@ -397,22 +494,34 @@ public class DocumentFilter extends org.
}
// Mime types
- String[] mimeTypes = new String[this.mimeTypes.size()];
- i = 0;
- for (String mimeType : this.mimeTypes) {
- mimeTypes[i++] = mimeType;
+ if (this.mimeTypes == null)
+ sb.append('-');
+ else
+ {
+ sb.append('+');
+ String[] mimeTypes = new String[this.mimeTypes.size()];
+ i = 0;
+ for (String mimeType : this.mimeTypes) {
+ mimeTypes[i++] = mimeType;
+ }
+ java.util.Arrays.sort(mimeTypes);
+ packList(sb,mimeTypes,'+');
}
- java.util.Arrays.sort(mimeTypes);
- packList(sb,mimeTypes,'+');
// Extensions
- String[] extensions = new String[this.extensions.size()];
- i = 0;
- for (String extension : this.extensions) {
- extensions[i++] = extension;
+ if (this.extensions == null)
+ sb.append('-');
+ else
+ {
+ sb.append('+');
+ String[] extensions = new String[this.extensions.size()];
+ i = 0;
+ for (String extension : this.extensions) {
+ extensions[i++] = extension;
+ }
+ java.util.Arrays.sort(extensions);
+ packList(sb,extensions,'+');
}
- java.util.Arrays.sort(extensions);
- packList(sb,extensions,'+');
// Min length
if (minLength == null)
@@ -422,6 +531,14 @@ public class DocumentFilter extends org.
pack(sb,minLength.toString(),'+');
}
+ // Min date
+ if (minDate == null)
+ sb.append('-');
+ else {
+ sb.append('+');
+ pack(sb,minDate.toString(),'+');
+ }
+
return sb.toString();
}
@@ -433,9 +550,17 @@ public class DocumentFilter extends org.
return true;
}
+ public boolean checkDate(Date date) {
+ if (minDate != null && date != null && date.getTime() < minDate)
+ return false;
+ return true;
+ }
+
public boolean checkMimeType(String mimeType) {
if (mimeType == null)
mimeType = "application/unknown";
+ if (mimeTypes == null)
+ return true;
return mimeTypes.contains(mimeType.toLowerCase(Locale.ROOT));
}
@@ -453,6 +578,8 @@ public class DocumentFilter extends org.
}
if (extension == null || extension.length() == 0)
extension = ".";
+ if (extensions == null)
+ return true;
return extensions.contains(extension.toLowerCase(Locale.ROOT));
}
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilterConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilterConfig.java?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilterConfig.java (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilterConfig.java Fri Oct 10 00:11:30 2014
@@ -29,10 +29,11 @@ public class DocumentFilterConfig {
public static final String NODE_MINLENGTH = "minlength";
public static final String MINLENGTH_DEFAULT = "0";
public static final String NODE_MAXLENGTH = "maxlength";
- public static final String MAXLENGTH_DEFAULT = "16777216";
+ public static final String MAXLENGTH_DEFAULT = "1000000000";
public static final String NODE_MIMETYPES = "mimetypes";
public static final String MIMETYPES_DEFAULT =
- "application/msword\n"
+ "*\n"
+ + "application/msword\n"
+ "application/vnd.ms-excel\n"
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document\n"
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\n"
@@ -48,8 +49,9 @@ public class DocumentFilterConfig {
+ "application/x-bittorrent";
public static final String NODE_EXTENSIONS = "extensions";
public static final String EXTENSIONS_DEFAULT =
- "doc\n" + "docx\n" + "xls\n" + "xlsx\n" + "ppt\n" + "pptx\n"
+ "*\n" + "log\n" + "doc\n" + "docx\n" + "xls\n" + "xlsx\n" + "ppt\n" + "pptx\n"
+ "html\n" + "pdf\n" + "odt\n" + "ods\n" + "rtf\n" + "txt\n" + "mp3\n"
+ "mp4\n" + "wav\n" + "ogg\n" + "flac\n" + "torrent";
+ public static final String NODE_MINDATE = "mindate";
public static final String ATTRIBUTE_VALUE = "value";
}
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_en_US.properties?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_en_US.properties Fri Oct 10 00:11:30 2014
@@ -18,3 +18,17 @@ DocumentFilter.MinFileSizeBytesColon=Min
DocumentFilter.MaxFileSizeBytesColon=Max file size (bytes):
DocumentFilter.AllowedMIMETypesColon=Allowed MIME types:
DocumentFilter.AllowedFileExtensionsColon=Allowed file extensions:
+
+DocumentFilter.MinDateColon=Minimum document date:
+DocumentFilter.Month_0=January
+DocumentFilter.Month_1=February
+DocumentFilter.Month_2=March
+DocumentFilter.Month_3=April
+DocumentFilter.Month_4=May
+DocumentFilter.Month_5=June
+DocumentFilter.Month_6=July
+DocumentFilter.Month_7=August
+DocumentFilter.Month_8=September
+DocumentFilter.Month_9=October
+DocumentFilter.Month_10=November
+DocumentFilter.Month_11=December
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_ja_JP.properties?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_ja_JP.properties Fri Oct 10 00:11:30 2014
@@ -18,3 +18,17 @@ DocumentFilter.MinFileSizeBytesColon=æ�
DocumentFilter.MaxFileSizeBytesColon=最大ファイルサイズ (バイト):
DocumentFilter.AllowedMIMETypesColon=利用可能なMIMEタイプ：
DocumentFilter.AllowedFileExtensionsColon=利用可能なファイル拡張子：
+
+DocumentFilter.MinDateColon=Minimum document date:
+DocumentFilter.Month_0=January
+DocumentFilter.Month_1=February
+DocumentFilter.Month_2=March
+DocumentFilter.Month_3=April
+DocumentFilter.Month_4=May
+DocumentFilter.Month_5=June
+DocumentFilter.Month_6=July
+DocumentFilter.Month_7=August
+DocumentFilter.Month_8=September
+DocumentFilter.Month_9=October
+DocumentFilter.Month_10=November
+DocumentFilter.Month_11=December
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_zh_CN.properties?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/documentfilter/common_zh_CN.properties Fri Oct 10 00:11:30 2014
@@ -18,3 +18,17 @@ DocumentFilter.MinFileSizeBytesColon=æ�
DocumentFilter.MaxFileSizeBytesColon=最大文件大小(字节):
DocumentFilter.AllowedMIMETypesColon=可利用的MIME类型:
DocumentFilter.AllowedFileExtensionsColon=可利用的文件扩展名:
+
+DocumentFilter.MinDateColon=Minimum document date:
+DocumentFilter.Month_0=January
+DocumentFilter.Month_1=February
+DocumentFilter.Month_2=March
+DocumentFilter.Month_3=April
+DocumentFilter.Month_4=May
+DocumentFilter.Month_5=June
+DocumentFilter.Month_6=July
+DocumentFilter.Month_7=August
+DocumentFilter.Month_8=September
+DocumentFilter.Month_9=October
+DocumentFilter.Month_10=November
+DocumentFilter.Month_11=December
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/editSpecification_Contents.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/editSpecification_Contents.html?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/editSpecification_Contents.html (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/editSpecification_Contents.html Fri Oct 10 00:11:30 2014
@@ -19,6 +19,61 @@
<table class="displaytable">
<tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DocumentFilter.MinDateColon'))</nobr></td>
+ <td class="value">
+ <nobr>
+ <select name="s${SEQNUM}_mindateyear" size="5">
+ #foreach ($year in [1969..2020])
+ #if ($MINDATEYEAR == $year)
+ <option value="$year" selected="true">$year</option>
+ #else
+ <option value="$year">$year</option>
+ #end
+ #end
+ </select>
+ <select name="s${SEQNUM}_mindatemonth" size="5">
+ #foreach ($month in [0..11])
+ #set ($monthkey = "DocumentFilter.Month_${month}")
+ #if ($MINDATEMONTH == $month)
+ <option value="$month" selected="true">$Encoder.bodyEscape($ResourceBundle.getString($monthkey))</option>
+ #else
+ <option value="$month">$Encoder.bodyEscape($ResourceBundle.getString($monthkey))</option>
+ #end
+ #end
+ </select>
+ <select name="s${SEQNUM}_mindateday" size="5">
+ #foreach ($day in [1..31])
+ #set ($dayvalue = $day)
+ #if ($MINDATEDAY == $day)
+ <option value="$day" selected="true">$dayvalue</option>
+ #else
+ <option value="$day">$dayvalue</option>
+ #end
+ #end
+ </select>
+ <select name="s${SEQNUM}_mindatehour" size="5">
+ #foreach ($hour in [0..23])
+ #if ($MINDATEHOUR == $hour)
+ <option value="$hour" selected="true">$hour</option>
+ #else
+ <option value="$hour">$hour</option>
+ #end
+ #end
+ </select>
+ :
+ <select name="s${SEQNUM}_mindateminute" size="5">
+ #foreach ($minute in ["00","01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59"])
+ #if ($MINDATEMINUTE == $minute)
+ <option value="$minute" selected="true">$minute</option>
+ #else
+ <option value="$minute">$minute</option>
+ #end
+ #end
+ </select>
+ </nobr>
+ </td>
+ </tr>
+ <tr>
<td class="description">
<nobr>$Encoder.bodyEscape($ResourceBundle.getString('DocumentFilter.MinFileSizeBytesColon'))</nobr>
</td>
@@ -50,6 +105,12 @@
#else
+<input type="hidden" name="s${SEQNUM}_mindateyear" value="$MINDATEYEAR" />
+<input type="hidden" name="s${SEQNUM}_mindatemonth" value="$MINDATEMONTH" />
+<input type="hidden" name="s${SEQNUM}_mindateday" value="$MINDATEDAY" />
+<input type="hidden" name="s${SEQNUM}_mindatehour" value="$MINDATEHOUR" />
+<input type="hidden" name="s${SEQNUM}_mindateminute" value="$MINDATEMINUTE" />
+
<input type="hidden" name="s${SEQNUM}_minfilesize" value="$Encoder.attributeEscape($MINFILESIZE)" />
<input type="hidden" name="s${SEQNUM}_maxfilesize" value="$Encoder.attributeEscape($MAXFILESIZE)" />
<input type="hidden" name="s${SEQNUM}_mimetypes" value="$Encoder.attributeEscape($MIMETYPES)" />
Modified: manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/viewSpecification.html?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/documentfilter/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/documentfilter/viewSpecification.html Fri Oct 10 00:11:30 2014
@@ -15,8 +15,17 @@
limitations under the License.
-->
+#set ($month = "DocumentFilter.Month_${MINDATEMONTH}")
<table class="displaytable">
<tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DocumentFilter.MinDateColon'))</nobr></td>
+ <td class="value">
+ <nobr>
+ $MINDATEYEAR $Encoder.bodyEscape($ResourceBundle.getString($month)) $MINDATEDAY $MINDATEHOUR:$MINDATEMINUTE
+ </nobr>
+ </td>
+ </tr>
+ <tr>
<td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DocumentFilter.MinFileSizeBytesColon'))</nobr></td>
<td class="value">$Encoder.bodyEscape($MINFILESIZE)</td>
</tr>
Modified: manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1630623&r1=1630622&r2=1630623&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Fri Oct 10 00:11:30 2014
@@ -298,31 +298,23 @@ public class FileConnector extends org.a
String versionString;
String convertPath;
long fileLength = file.length();
- if (activities.checkLengthIndexable(fileLength))
- {
- // Get the file's modified date.
- long lastModified = file.lastModified();
+ // Get the file's modified date.
+ long lastModified = file.lastModified();
- // Check if the path is to be converted. We record that info in the version string so that we'll reindex documents whose
- // URI's change.
- convertPath = findConvertPath(spec, file);
- StringBuilder sb = new StringBuilder();
- if (convertPath != null)
- {
- // Record the path.
- sb.append("+");
- pack(sb,convertPath,'+');
- }
- else
- sb.append("-");
- sb.append(new Long(lastModified).toString()).append(":").append(new Long(fileLength).toString());
- versionString = sb.toString();
- }
- else
+ // Check if the path is to be converted. We record that info in the version string so that we'll reindex documents whose
+ // URI's change.
+ convertPath = findConvertPath(spec, file);
+ StringBuilder sb = new StringBuilder();
+ if (convertPath != null)
{
- activities.deleteDocument(documentIdentifier);
- continue;
+ // Record the path.
+ sb.append("+");
+ pack(sb,convertPath,'+');
}
+ else
+ sb.append("-");
+ sb.append(new Long(lastModified).toString()).append(":").append(new Long(fileLength).toString());
+ versionString = sb.toString();
if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
{
@@ -345,6 +337,7 @@ public class FileConnector extends org.a
{
Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
activities.noDocument(documentIdentifier,versionString);
+ activities.recordActivity(null,ACTIVITY_READ,null,documentIdentifier,"FILETOOLONG","Document rejected because of length",null);
continue;
}
@@ -352,6 +345,7 @@ public class FileConnector extends org.a
{
Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
activities.noDocument(documentIdentifier,versionString);
+ activities.recordActivity(null,ACTIVITY_READ,null,documentIdentifier,"URLREJECTED","Document rejected because of URL",null);
continue;
}
@@ -359,6 +353,7 @@ public class FileConnector extends org.a
{
Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
activities.noDocument(documentIdentifier,versionString);
+ activities.recordActivity(null,ACTIVITY_READ,null,documentIdentifier,"DATEREJECTED","Document rejected because of date",null);
continue;
}
@@ -366,6 +361,7 @@ public class FileConnector extends org.a
{
Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
activities.noDocument(documentIdentifier,versionString);
+ activities.recordActivity(null,ACTIVITY_READ,null,documentIdentifier,"MIMETYPEREJECTED","Document rejected because of mime type",null);
continue;
}