You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2015/12/06 22:14:06 UTC
svn commit: r1718223 - in /nutch/trunk: CHANGES.txt
conf/contenttype-mapping.txt.template
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel
Date: Sun Dec 6 21:14:06 2015
New Revision: 1718223
URL: http://svn.apache.org/viewvc?rev=1718223&view=rev
Log:
NUTCH-2172 index-more: document format of contenttype-mapping.txt
Added:
nutch/trunk/conf/contenttype-mapping.txt.template
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718223&r1=1718222&r2=1718223&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Dec 6 21:14:06 2015
@@ -1,5 +1,7 @@
Nutch Change Log
-
+
+* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel)
+
Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
Added: nutch/trunk/conf/contenttype-mapping.txt.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/contenttype-mapping.txt.template?rev=1718223&view=auto
==============================================================================
--- nutch/trunk/conf/contenttype-mapping.txt.template (added)
+++ nutch/trunk/conf/contenttype-mapping.txt.template Sun Dec 6 21:14:06 2015
@@ -0,0 +1,22 @@
+#
+# Mapping of detected content types (MIME types) to custom types (target types)
+# used by the plugin index-more when filling the index field `type'.
+#
+# Note: The mappings defined in this file are only active if the property
+# `moreIndexingFilter.mapMimeTypes' is true.
+#
+# Format (tab-separated plain text, comment lines start with `#'):
+#
+# <target type> <TAB> <detected type1> [<TAB> <detected type2> ...]
+#
+# Examples (uncomment to activate):
+#
+# Map XHTML to HTML
+#text/html application/xhtml+xml
+#
+# Map XHTML and HTML to a custom type "web page"
+#web page text/html application/xhtml+xml
+#
+# Map various office document formats to a custom type "office document"
+#office document application/vnd.oasis.opendocument.text application/x-tika-msoffice application/msword
+#
Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1718223&r1=1718222&r2=1718223&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Dec 6 21:14:06 2015
@@ -312,10 +312,12 @@ public class MoreIndexingFilter implemen
}
private void readConfiguration() throws IOException {
+ LOG.info("Reading content type mappings from file contenttype-mapping.txt");
BufferedReader reader = new BufferedReader(
conf.getConfResourceAsReader("contenttype-mapping.txt"));
String line;
String parts[];
+ boolean formatWarningShown = false;
mimeMap = new HashMap<String, String>();
@@ -329,6 +331,12 @@ public class MoreIndexingFilter implemen
for (int i = 1; i < parts.length; i++) {
mimeMap.put(parts[i].trim(), parts[0].trim());
}
+ } else {
+ LOG.warn("Wrong format of line: {}", line);
+ if (!formatWarningShown) {
+ LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
+ formatWarningShown = true;
+ }
}
}
}