You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/09/08 21:45:01 UTC
svn commit: r279605 [3/4] - in /lucene/nutch/branches/mapred: ./ bin/ conf/
docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/
docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ lib/ site/
src/java/org/apache/nutch/anal...
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Thu Sep 8 12:42:44 2005
@@ -129,7 +129,8 @@
}
/** The media type of the retrieved content.
- * @see http://www.iana.org/assignments/media-types/
+ * @see <a href="http://www.iana.org/assignments/media-types/">
+ * http://www.iana.org/assignments/media-types/</a>
*/
public String getContentType() {
ensureInflated();
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java Thu Sep 8 12:42:44 2005
@@ -18,7 +18,6 @@
import java.net.URL;
-/** Thrown by {@link Protocol#getContent(String)}.*/
public class ProtocolException extends Exception {
public ProtocolException() {
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java Thu Sep 8 12:42:44 2005
@@ -75,7 +75,7 @@
Extension extension = findExtension(name);
- CACHE.put(name, extension);
+ if (extension != null) CACHE.put(name, extension);
return extension;
}
@@ -83,7 +83,7 @@
private static Extension findExtension(String name)
throws PluginRuntimeException {
- Extension[] extensions = X_POINT.getExtentens();
+ Extension[] extensions = X_POINT.getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java Thu Sep 8 12:42:44 2005
@@ -19,7 +19,7 @@
import java.io.IOException;
import java.net.URL;
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} is invalid.*/
+/** Thrown when a resource is invalid. */
public class ResourceGone extends ProtocolException {
private URL url;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java Thu Sep 8 12:42:44 2005
@@ -19,8 +19,7 @@
import java.io.IOException;
import java.net.URL;
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} no longer
- * exists.*/
+/** Thrown when a resource no longer exists.*/
public class ResourceMoved extends IOException {
private URL oldUrl;
private URL newUrl;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java Thu Sep 8 12:42:44 2005
@@ -19,8 +19,7 @@
import java.io.IOException;
import java.net.URL;
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} should be
- * retried later.*/
+/** Thrown when a resource should be retried later.*/
public class RetryLater extends ProtocolException {
private URL url;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java Thu Sep 8 12:42:44 2005
@@ -162,7 +162,7 @@
continue;
}
for (int j = 0; j < segments.length; j++) {
- LOG.info("Client: segment "+segments[j]+" at "+addr);
+ LOG.finest("Client: segment "+segments[j]+" at "+addr);
segmentToAddress.put(segments[j], addr);
}
liveAddresses.add(addr);
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java Thu Sep 8 12:42:44 2005
@@ -44,14 +44,14 @@
}
/** Returns the total number of hits for this query. This may be an estimate
- * when (@link totalIsExact()} is false. */
+ * when {@link #totalIsExact()} is false. */
public long getTotal() { return total; }
- /** True if {@link getTotal()} gives the exact number of hits, or false if
+ /** True if {@link #getTotal()} gives the exact number of hits, or false if
* it is only an estimate of the total number of hits. */
public boolean totalIsExact() { return totalIsExact; }
- /** Set {@link totalIsExact()}. */
+ /** Set {@link #totalIsExact()}. */
public void setTotalIsExact(boolean isExact) { totalIsExact = isExact; }
/** Returns the number of hits included in this current listing. */
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Thu Sep 8 12:42:44 2005
@@ -17,7 +17,6 @@
package org.apache.nutch.searcher;
import java.io.IOException;
-import java.net.URL;
import java.net.URLEncoder;
import java.util.logging.Level;
import java.util.Map;
@@ -38,12 +37,6 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-import org.apache.nutch.html.Entities;
-import org.apache.nutch.searcher.*;
-import org.apache.nutch.plugin.*;
-import org.apache.nutch.clustering.*;
-import org.apache.nutch.util.NutchConf;
-
/** Present search results using A9's OpenSearch extensions to RSS, plus a few
* Nutch-specific extensions. */
@@ -74,7 +67,7 @@
public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
- bean.LOG.info("query request from " + request.getRemoteAddr());
+ NutchBean.LOG.info("query request from " + request.getRemoteAddr());
// get parameters from request
request.setCharacterEncoding("UTF-8");
@@ -122,7 +115,7 @@
(dedupField == null ? "" : "&dedupField=" + dedupField));
Query query = Query.parse(queryString);
- bean.LOG.info("query: " + queryString);
+ NutchBean.LOG.info("query: " + queryString);
// execute the query
Hits hits;
@@ -130,11 +123,11 @@
hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField,
sort, reverse);
} catch (IOException e) {
- bean.LOG.log(Level.WARNING, "Search Error", e);
+ NutchBean.LOG.log(Level.WARNING, "Search Error", e);
hits = new Hits(0,new Hit[0]);
}
- bean.LOG.info("total hits: " + hits.getTotal());
+ NutchBean.LOG.info("total hits: " + hits.getTotal());
// generate xml results
int end = (int)Math.min(hits.getLength(), start + hitsPerPage);
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java Thu Sep 8 12:42:44 2005
@@ -45,7 +45,7 @@
.getExtensionPoint(QueryFilter.X_POINT_ID);
if (point == null)
throw new RuntimeException(QueryFilter.X_POINT_ID+" not found.");
- Extension[] extensions = point.getExtentens();
+ Extension[] extensions = point.getExtensions();
CACHE = new QueryFilter[extensions.length];
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java Thu Sep 8 12:42:44 2005
@@ -179,7 +179,8 @@
* @param withParseText if true, fix parse_text, otherwise ignore it
* @param withParseData if true, fix parse_data, otherwise ignore it
* @param dryrun if true, only show what would be done without performing any actions
- * @return
+ * @return <code>true</code> if segment was fixed successfully, otherwise
+ * return <code>false</code>.
*/
public static boolean fixSegment(NutchFileSystem nfs, File dir,
boolean withContent, boolean withParseText, boolean withParseData,
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java Thu Sep 8 12:42:44 2005
@@ -69,6 +69,9 @@
final private static float DECAY_VALUE = 0.85f;
public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.tools.DistributedAnalysisTool");
+
+ public final static long OUTLINK_LIMIT = 10000;
+
/**
* The EditSet inner class represents all of the sorted edits
@@ -343,8 +346,10 @@
try {
// Iterate through all items in the webdb, sorted by URL
long curIndex = 0;
+ long linkCount = 0;
ScoreValue score = new ScoreValue();
IWebDBReader reader = new WebDBReader(nfs, dbDir);
+ MD5Hash lastHash = null;
try {
for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); curIndex++) {
//
@@ -366,7 +371,25 @@
// OK, do some analysis!
//
Page curPage = (Page) e.nextElement();
+
+ // Process only one page from set of pages having the same
+ // MD5. Otherwise all links from these pages would be processed
+ // multiple times.
+ MD5Hash newHash = curPage.getMD5();
+ if (newHash.equals(lastHash)) {
+ continue;
+ }
+ lastHash = newHash;
+
Link outLinks[] = reader.getLinks(curPage.getMD5());
+ linkCount += outLinks.length;
+
+ if (outLinks.length > OUTLINK_LIMIT) {
+ LOG.info("Suspicious outlink count = "
+ + outLinks.length + " for ["
+ + curPage.getURL().toString() + "].");
+ }
+
int targetOutlinkers = 0;
for (int i = 0; i < outLinks.length; i++) {
if (outLinks[i].targetHasOutlink()) {
@@ -402,7 +425,9 @@
}
if (((curIndex - startIndex) % 5000) == 0) {
- LOG.info("Pages consumed: " + (curIndex - startIndex) + " (at index " + curIndex + ")");
+ LOG.info("Pages consumed: " + (curIndex - startIndex)
+ + " (at index " + curIndex
+ + "). Links fetched: " + linkCount + ".");
}
}
} finally {
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java Thu Sep 8 12:42:44 2005
@@ -16,7 +16,7 @@
package org.apache.nutch.util;
-/** A thread that has called {@link Thread#SetDaemon(boolean) } with true.*/
+/** A thread that has called {@link Thread#setDaemon(boolean) } with true.*/
public class Daemon extends Thread {
{
Modified: lucene/nutch/branches/mapred/src/plugin/build-plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build-plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/build-plugin.xml Thu Sep 8 12:42:44 2005
@@ -9,7 +9,7 @@
<!-- Load all the default properties, and any the user wants -->
<!-- to contribute (without having to type -D or edit this file -->
- <property file="${user.home}/$(name}.build.properties" />
+ <property file="${user.home}/${name}.build.properties" />
<property file="${root}/build.properties" />
<property name="nutch.root" location="${root}/../../../"/>
@@ -35,6 +35,8 @@
<property name="build.encoding" value="ISO-8859-1"/>
+ <path id="plugin.deps"/>
+
<fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/>
<!-- the normal classpath -->
@@ -45,6 +47,7 @@
<fileset dir="${nutch.root}/lib">
<include name="*.jar" />
</fileset>
+ <path refid="plugin.deps"/>
</path>
<!-- the unit test classpath -->
@@ -108,7 +111,7 @@
<mkdir dir="${deploy.dir}"/>
<copy file="plugin.xml" todir="${deploy.dir}"
preservelastmodified="true"/>
- <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}"/>
+ <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/>
<copy todir="${deploy.dir}" flatten="true">
<fileset refid="lib.jars"/>
</copy>
Modified: lucene/nutch/branches/mapred/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/build.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/build.xml Thu Sep 8 12:42:44 2005
@@ -6,6 +6,8 @@
<!-- Build & deploy all the plugin jars. -->
<!-- ====================================================== -->
<target name="deploy">
+ <ant dir="lib-jakarta-poi" target="deploy"/>
+ <ant dir="nutch-extensionpoints" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
<ant dir="protocol-ftp" target="deploy"/>
<ant dir="protocol-http" target="deploy"/>
@@ -14,10 +16,13 @@
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-text" target="deploy"/>
<ant dir="parse-pdf" target="deploy"/>
+ <ant dir="parse-rss" target="deploy"/>
<ant dir="parse-msword" target="deploy"/>
+ <ant dir="parse-mspowerpoint" target="deploy"/>
<!-- <ant dir="parse-mp3" target="deploy"/> -->
<!-- <ant dir="parse-rtf" target="deploy"/> -->
<ant dir="parse-ext" target="deploy"/>
+ <ant dir="parse-zip" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="query-basic" target="deploy"/>
@@ -39,10 +44,13 @@
<ant dir="protocol-http" target="test"/>
<ant dir="parse-html" target="test"/>
<ant dir="parse-pdf" target="test"/>
+ <ant dir="parse-rss" target="test"/>
<ant dir="parse-msword" target="test"/>
+ <ant dir="parse-mspowerpoint" target="test"/>
<!-- <ant dir="parse-mp3" target="test"/> -->
<!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-ext" target="test"/>
+ <ant dir="parse-zip" target="test"/>
<ant dir="creativecommons" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="ontology" target="test"/>
@@ -52,6 +60,8 @@
<!-- Clean all of the plugins. -->
<!-- ====================================================== -->
<target name="clean">
+ <ant dir="lib-jakarta-poi" target="clean"/>
+ <ant dir="nutch-extensionpoints" target="clean"/>
<ant dir="protocol-file" target="clean"/>
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-http" target="clean"/>
@@ -60,10 +70,13 @@
<ant dir="parse-js" target="clean"/>
<ant dir="parse-text" target="clean"/>
<ant dir="parse-pdf" target="clean"/>
+ <ant dir="parse-rss" target="clean"/>
<ant dir="parse-msword" target="clean"/>
+ <ant dir="parse-mspowerpoint" target="clean"/>
<ant dir="parse-mp3" target="clean"/>
<ant dir="parse-rtf" target="clean"/>
<ant dir="parse-ext" target="clean"/>
+ <ant dir="parse-zip" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="query-basic" target="clean"/>
@@ -74,8 +87,8 @@
<ant dir="urlfilter-prefix" target="clean"/>
<ant dir="creativecommons" target="clean"/>
<ant dir="languageidentifier" target="clean"/>
- <ant dir="clustering-carrot2" target="deploy"/>
- <ant dir="ontology" target="deploy"/>
+ <ant dir="clustering-carrot2" target="clean"/>
+ <ant dir="ontology" target="clean"/>
</target>
</project>
Modified: lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,10 +5,6 @@
version="0.9.0"
provider-name="carrot2.sourceforge.net">
- <extension-point
- id="org.apache.nutch.clustering.OnlineClusterer"
- name="Nutch Online Search Results Clustering Plugin"/>
-
<runtime>
<library name="clustering-carrot2.jar">
<export name="*"/>
Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,18 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.HtmlParseFilter"
- name="HTML Parse Filter"/>
-
- <extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
-
- <extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
-
<runtime>
<library name="creativecommons.jar">
<export name="*"/>
Modified: lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
<runtime>
<library name="index-basic.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,15 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
-
- <!--
- <extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
- -->
<runtime>
<library name="index-more.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Sep 8 12:42:44 2005
@@ -48,11 +48,12 @@
import java.text.SimpleDateFormat;
import java.util.Date;
+import java.util.Locale;
import java.util.TimeZone;
import java.util.Enumeration;
import java.util.Properties;
-
+import org.apache.commons.lang.time.DateUtils;
/**
* Add (or reset) a few metaData properties as respective fields
* (if they are available), so that they can be displayed by more.jsp
@@ -131,15 +132,38 @@
long time = -1;
try {
time = HttpDateFormat.toLong(date);
- } catch (ParseException e) {
- // try to parse it as date in alternative format
- try {
- DateFormat df = new SimpleDateFormat("EEE MMM dd HH:mm:ss yyyy zzz");
- Date d = df.parse(date);
- time = d.getTime();
- } catch (Exception e1) {
- LOG.warning(url+": can't parse erroneous date: "+date);
- }
+ } catch (ParseException e) {
+ // try to parse it as date in alternative format
+ try {
+ Date parsedDate = DateUtils.parseDate(date,
+ new String [] {
+ "EEE MMM dd HH:mm:ss yyyy",
+ "EEE MMM dd HH:mm:ss yyyy zzz",
+ "EEE, MMM dd HH:mm:ss yyyy zzz",
+ "EEE, dd MMM yyyy HH:mm:ss zzz",
+ "EEE,dd MMM yyyy HH:mm:ss zzz",
+ "EEE, dd MMM yyyy HH:mm:sszzz",
+ "EEE, dd MMM yyyy HH:mm:ss",
+ "EEE, dd-MMM-yy HH:mm:ss zzz",
+ "yyyy/MM/dd HH:mm:ss.SSS zzz",
+ "yyyy/MM/dd HH:mm:ss.SSS",
+ "yyyy/MM/dd HH:mm:ss zzz",
+ "yyyy/MM/dd",
+ "yyyy.MM.dd HH:mm:ss",
+ "yyyy-MM-dd HH:mm",
+ "MMM dd yyyy HH:mm:ss. zzz",
+ "MMM dd yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy HH:mm:ss zzz",
+ "dd MM yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy; HH:mm:ss",
+ "dd.MM.yyyy HH:mm:ss",
+ "dd.MM.yyyy zzz"
+ });
+ time = parsedDate.getTime();
+ // LOG.warning(url + ": parsed date: " + date +" to:"+time);
+ } catch (Exception e2) {
+ LOG.warning(url + ": can't parse erroneous date: " + date);
+ }
}
return time;
}
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,17 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.HtmlParseFilter"
- name="HTML Parse Filter"/>
- <extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
-
- <extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
<runtime>
<library name="language-identifier.jar">
@@ -29,7 +19,7 @@
<implementation id="LanguageParser"
class="org.apache.nutch.analysis.lang.HTMLLanguageParser"/>
</extension>
-
+
<extension id="org.apache.nutch.analysis.lang"
name="Nutch language identifier filter"
point="org.apache.nutch.indexer.IndexingFilter">
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Thu Sep 8 12:42:44 2005
@@ -14,23 +14,62 @@
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
+
+// JDK imports
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Nutch imports
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
-import org.w3c.dom.*;
-
-import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
-/** Adds metadata identifying language of document if found
+// DOM imports
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+
+/**
+ * Adds metadata identifying language of document if found
* We could also run statistical analysis here but we'd miss all other formats
*/
public class HTMLLanguageParser implements HtmlParseFilter {
+
public static final String META_LANG_NAME="X-meta-lang";
public static final Logger LOG = LogFormatter
.getLogger(HTMLLanguageParser.class.getName());
+ /* A static Map of ISO-639 language codes */
+ private static Map LANGUAGES_MAP = new HashMap();
+ static {
+ try {
+ Properties p = new Properties();
+ p.load(HTMLLanguageParser.class
+ .getResourceAsStream("langmappings.properties"));
+ Enumeration keys = p.keys();
+ while (keys.hasMoreElements()) {
+ String key = (String) keys.nextElement();
+ String[] values = p.getProperty(key).split(",", -1);
+ LANGUAGES_MAP.put(key, key);
+ for (int i=0; i<values.length; i++) {
+ LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+ }
+ }
+ } catch (Exception e) {
+ LOG.severe(e.toString());
+ }
+ }
+
+
+
/**
* Scan the HTML document looking at possible indications of content language<br>
* <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
@@ -39,60 +78,122 @@
* <br>Only the first occurrence of language is stored.
*/
public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
- String lang = findLanguage(doc);
+
+ // Trying to find the document's language
+ LanguageParser parser = new LanguageParser(doc);
+ String lang = parser.getLanguage();
if (lang != null) {
parse.getData().getMetadata().put(META_LANG_NAME, lang);
}
-
return parse;
}
-
- private String findLanguage(Node node) {
- String lang = null;
- if (node.getNodeType() == Node.ELEMENT_NODE) {
-
- //lang attribute
- lang = ((Element) node).getAttribute("lang");
- if (lang != null && lang.length()>1) {
- return lang;
- }
- if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ static class LanguageParser {
+
+ private String dublinCore = null;
+ private String htmlAttribute = null;
+ private String httpEquiv = null;
+ private String language = null;
+
+ LanguageParser(Node node) {
+ parse(node);
+ if (htmlAttribute != null) { language = htmlAttribute; }
+ else if (dublinCore != null) { language = dublinCore; }
+ else {language = httpEquiv; }
+ }
+
+ String getLanguage() {
+ return language;
+ }
+
+ void parse(Node node) {
- NamedNodeMap attrs=node.getAttributes();
+ String lang = null;
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+ // Check for the lang HTML attribute
+ if (htmlAttribute == null) {
+ htmlAttribute = parseLanguage(((Element) node).getAttribute("lang"));
+ }
- //dc.language
- for(int i=0;i<attrs.getLength();i++){
- Node attrnode=attrs.item(i);
- if("name".equalsIgnoreCase(attrnode.getNodeName())){
- if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){
- Node valueattr=attrs.getNamedItem("content");
- lang = (valueattr!=null)?valueattr.getNodeValue():null;
+ // Check for Meta
+ if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+
+ // Check for the dc.language Meta
+ if (dublinCore == null) {
+ for (int i=0; i<attrs.getLength(); i++) {
+ Node attrnode = attrs.item(i);
+ if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
+ if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
+ Node valueattr = attrs.getNamedItem("content");
+ if (valueattr != null) {
+ dublinCore = parseLanguage(valueattr.getNodeValue());
+ }
+ }
+ }
}
}
- }
-
- //http-equiv content-language
- for(int i=0;i<attrs.getLength();i++){
- Node attrnode=attrs.item(i);
- if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){
- if("content-language".equals(attrnode.getNodeValue().toLowerCase())){
- Node valueattr=attrs.getNamedItem("content");
- lang = (valueattr!=null)?valueattr.getNodeValue():null;
+
+ // Check for the http-equiv content-language
+ if (httpEquiv == null) {
+ for (int i=0; i<attrs.getLength(); i++){
+ Node attrnode = attrs.item(i);
+ if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
+ if ("content-language".equals(attrnode.getNodeValue().toLowerCase())) {
+ Node valueattr = attrs.getNamedItem("content");
+ if (valueattr != null) {
+ httpEquiv = parseLanguage(valueattr.getNodeValue());
+ }
+ }
+ }
}
}
}
}
+
+ // Recurse
+ NodeList children = node.getChildNodes();
+ for (int i=0; children != null && i<children.getLength(); i++) {
+ parse(children.item(i));
+ if ((dublinCore != null) &&
+ (htmlAttribute != null) &&
+ (httpEquiv != null)) {
+ return;
+ }
+ }
}
-
- //recurse
- NodeList children = node.getChildNodes();
- for (int i = 0; children != null && i < children.getLength(); i++) {
- lang = findLanguage(children.item(i));
- if(lang != null && lang.length()>1) return lang;
+
+ /**
+ * Parse a language string and return an ISO 639 primary code,
+ * or <code>null</code> if something wrong occurs, or if no language is found.
+ */
+ final static String parseLanguage(String lang) {
+
+ if (lang == null) { return null; }
+
+ String code = null;
+ String language = null;
+
+ // First, split multi-valued values
+ String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+
+ int i = 0;
+ while ((language == null) && (i<langs.length)) {
+ // Then, get the primary code
+ code = langs[i].split("-")[0];
+ code = code.split("_")[0];
+ // Find the ISO 639 code
+ language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+ i++;
+ }
+
+ return language;
}
-
- return lang;
+
}
+
+
}
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Thu Sep 8 12:42:44 2005
@@ -20,6 +20,7 @@
import java.io.InputStream;
import java.io.IOException;
import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.List;
@@ -48,9 +49,13 @@
/**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * Language Codes</a>
*
* @author Sami Siren
- * @author Jerome Charron
+ * @author Jérôme Charron
*/
public class LanguageIdentifier {
@@ -59,8 +64,8 @@
private final static float SCORE_THRESOLD = 0.00F;
- public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
-
+ private final static Logger LOG =
+ LogFormatter.getLogger(LanguageIdentifier.class.getName());
private ArrayList languages = new ArrayList();
@@ -168,7 +173,8 @@
}
/**
- * return handle to singleton instance
+ * Get a LanguageIdentifier instance.
+ * @return the LanguageIdentifier singleton instance.
*/
public static LanguageIdentifier getInstance() {
if (identifier == null) {
@@ -182,13 +188,25 @@
}
/**
- * main method used for testing
- *
- * @param args
+ * Main method used for command line process.
+ * <br/>Usage is:
+ * <pre>
+ * LanguageIdentifier [-identifyrows filename maxlines]
+ * [-identifyfile charset filename]
+ * [-identifyfileset charset files]
+ * [-identifytext text]
+ * [-identifyurl url]
+ * </pre>
+ * @param args arguments.
*/
public static void main(String args[]) {
- String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
+ String usage = "Usage: LanguageIdentifier " +
+ "[-identifyrows filename maxlines] " +
+ "[-identifyfile charset filename] " +
+ "[-identifyfileset charset files] " +
+ "[-identifytext text] " +
+ "[-identifyurl url]";
int command = 0;
final int IDFILE = 1;
@@ -199,6 +217,7 @@
Vector fileset = new Vector();
String filename = "";
+ String charset = "";
String url = "";
String text = "";
int max = 0;
@@ -211,6 +230,7 @@
for (int i = 0; i < args.length; i++) { // parse command line
if (args[i].equals("-identifyfile")) {
command = IDFILE;
+ charset = args[++i];
filename = args[++i];
}
@@ -233,6 +253,7 @@
if (args[i].equals("-identifyfileset")) {
command = IDFILESET;
+ charset = args[++i];
for (i++; i < args.length; i++) {
File[] files = null;
File f = new File(args[i]);
@@ -264,7 +285,7 @@
case IDFILE:
f = new File(filename);
fis = new FileInputStream(f);
- lang = idfr.identify(fis);
+ lang = idfr.identify(fis, charset);
fis.close();
break;
@@ -302,7 +323,7 @@
filename = (String) i.next();
f = new File(filename);
fis = new FileInputStream(f);
- lang = idfr.identify(fis);
+ lang = idfr.identify(fis, charset);
fis.close();
} catch (Exception e) {
System.out.println(e);
@@ -349,22 +370,26 @@
}
/**
- * Identify language based on submitted content
+ * Identify language of a content.
*
- * @param text to analyze
- * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
- * unknown
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
*/
- public String identify(String text) {
- return identify(new StringBuffer(text));
+ public String identify(String content) {
+ return identify(new StringBuffer(content));
}
/**
- * Identify language based on submitted content
+ * Identify language of a content.
*
- * @param text to analyze
- * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
- * unknown
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
*/
public String identify(StringBuffer content) {
@@ -405,26 +430,48 @@
}
/**
- * Identify language from inputstream
- *
- * @param is
- * @return language code
- * @throws IOException
+ * Identify language from input stream.
+ * This method uses the platform default encoding to read the input stream.
+ * For using a specific encoding, use the
+ * {@link #identify(InputStream, String)} method.
+ *
+ * @param is is the input stream to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if an error occurs while reading the input stream.
*/
public String identify(InputStream is) throws IOException {
+ return identify(is, null);
+ }
+
+ /**
+ * Identify language from input stream.
+ *
+ * @param is is the input stream to analyze.
+ * @param charset is the charset to use to read the input stream.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if an error occurs while reading the input stream.
+ */
+ public String identify(InputStream is, String charset) throws IOException {
- StringBuffer text = new StringBuffer();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] buffer = new byte[2048];
int len = 0;
while (((len = is.read(buffer)) != -1) &&
- ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+ ((analyzeLength == 0) || (out.size() < analyzeLength))) {
if (analyzeLength != 0) {
- len = Math.min(len, analyzeLength - text.length());
+ len = Math.min(len, analyzeLength - out.size());
}
- text.append(new String(buffer, 0, len));
+ out.write(buffer, 0, len);
}
- return identify(text);
+ return identify((charset == null) ? out.toString()
+ : out.toString(charset));
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties Thu Sep 8 12:42:44 2005
@@ -1,187 +1,188 @@
-aa=aar
-ab=abk
+# Defines mappings between commonly seen erroneous language codes and
+# the ISO 639 two-letter language codes.
+aa=aar,Afar
+ab=abk,Abkhazian
ae=ave
-af=afr
+af=afr,Afrikaans
ak=aka
-am=amh
+am=amh,Amharic
an=arg
-ar=ara
-as=asm
+ar=ara,Arabic
+as=asm,Assamese
av=ava
-ay=aym
-az=aze
-ba=bak
-be=bel
-bg=bul
-bh=bih
-bi=bis
+ay=aym,Aymara
+az=aze,Azerbaijani
+ba=bak,Bashkir
+be=bel,Byelorussian
+bg=bul,Bulgarian
+bh=bih,Bihari
+bi=bis,Bislama
bm=bam
-bn=ben
-bo=tib/bod
-br=bre
+bn=ben,Bengali
+bo=bod,tib,Tibetan
+br=bre,Breton
bs=bos
-ca=cat
+ca=cat,Catalan
ce=che
ch=cha
-co=cos
+co=cos,Corsican
cr=cre
-cs=cze/ces
+cs=ces,cze,Czech
cu=chu
cv=chv
-cy=wel/cym
-da=dan
-de=ger/deu
+cy=cym,wel,Welsh
+da=dan,Danish
+de=deu,ger,German
dv=div
-dz=dzo
+dz=dzo,Dzongkha
ee=ewe
-el=gre/ell
-en=eng
-eo=epo
-es=spa
-et=est
-eu=baq/eus
-fa=per/fas
+el=ell,gre,Greek
+en=eng,English
+eo=epo,Esperanto
+es=esl,spa,Spanish
+et=est,Estonian
+eu=baq,eus,Basque
+fa=fas,per,Persian
ff=ful
-fi=fin
-fj=fij
-fo=fao
-fr=fre/fra
-fy=fry
-ga=gle
+fi=fin,Finnish
+fj=fij,Fijian
+fo=fao,Faroese
+fr=fra,fre,French
+fy=fry,Frisian
+ga=gai,iri,Irish
gd=gla
-gl=glg
-gn=grn
-gu=guj
+gl=glg,Gallegan
+gn=grn,Guarani
+gu=guj,Gujarati
gv=glv
-ha=hau
-he=heb
-hi=hin
+ha=hau,Hausa
+he=heb,Hebrew
+hi=hin,Hindi
ho=hmo
-hr=scr/hrv
+hr=scr,hrv,Croatian
ht=hat
-hu=hun
-hy=arm/hye
+hu=hun,Hungarian
+hy=arm,hye,Armenian
hz=her
-ia=ina
-id=ind
+ia=ina,Interlingua
+id=ind,Indonesian
ie=ile
ig=ibo
ii=iii
-ik=ipk
+ik=ipk,Inupiak
io=ido
-is=ice/isl
-it=ita
-iu=iku
-ja=jpn
-jv=jav
-ka=geo/kat
+is=ice,isl,Icelandic
+it=ita,Italian
+iu=iku,Inuktitut
+ja=jpn,Japanese
+jv=jw,jav,jaw,Javanese
+ka=geo,kat,Georgian
kg=kon
ki=kik
-ki=kik
-kj=kua
kj=kua
-kk=kaz
-kl=kal
-km=khm
-kn=kan
-ko=kor
+kk=kaz,Kazakh
+kl=kal,Greenlandic
+km=khm,Khmer
+kn=kan,Kannada
+ko=kor,Korean
kr=kau
-ks=kas
-ku=kur
+ks=kas,Kashmiri
+ku=kur,Kurdish
kv=kom
kw=cor
-ky=kir
-la=lat
+ky=kir,Kirghiz
+la=lat,Latin
lb=ltz
lg=lug
li=lim
-ln=lin
-lo=lao
-lt=lit
+ln=lin,Lingala
+lo=lao,Lao
+lt=lit,Lithuanian
lu=lub
-lv=lav
-mg=mlg
+lv=lav,Latvian
+mg=mlg,Malagasy
mh=mah
-mi=mao/mri
-mk=mac/mkd
-ml=mal
-mn=mon
-mo=mol
-mr=mar
-ms=may/msa
+mi=mao,mri,Maori
+mk=mac,mak,Macedonian
+ml=mal,Malayalam
+mn=mon,Mongolian
+mo=mol,Moldavian
+mr=mar,Marathi
+ms=may,msa,Malay
mt=mlt
-my=bur/mya
-na=nau
+my=bur,mya,Burmese
+na=nau,Nauru
nb=nob
nd=nde
-ne=nep
+ne=nep,Nepali
ng=ndo
-nl=dut/nld
+nl=dut,nla,Dutch
nn=nno
-no=nor
+no=nor,Norwegian
nr=nbl
nv=nav
ny=nya
-oc=oci
+oc=oci,Langue d'Oc
oj=oji
-om=orm
-or=ori
+om=orm,Oromo
+or=ori,Oriya
os=oss
-pa=pan
+pa=pan,Panjabi
pi=pli
-pl=pol
-ps=pus
-pt=por
-qu=que
-rm=roh
-rn=run
-ro=rum/ron
-ru=rus
-rw=kin
-sa=san
+pl=pol,Polish
+ps=pus,Pushto
+pt=por,Portuguese
+qu=que,Quechua
+rm=roh,Rhaeto-Romance
+rn=run,Rundi
+ro=ron,rum,Romanian
+ru=rus,Russian
+rw=kin,Kinyarwanda
+sa=san,Sanskrit
sc=srd
-sd=snd
+sd=snd,Sindhi
se=sme
-sg=sag
-si=sin
-sk=slo/slk
-sl=slv
-sm=smo
-sn=sna
-so=som
-sq=alb/sqi
-sr=scc/srp
-ss=ssw
-st=sot
-su=sun
-sv=swe
-sw=swa
-ta=tam
-te=tel
-tg=tgk
-th=tha
-ti=tir
-tk=tuk
-tl=tgl
-tn=tsn
-to=ton
-tr=tur
-ts=tso
-tt=tat
-tw=twi
+sg=sag,Sango
+sh=scr,Serbo-Croatian
+si=sin,Singhalese
+sk=slk,slo,Slovak
+sl=slv,Slovenian
+sm=smo,Samoan
+sn=sna,Shona
+so=som,Somali
+sq=alb,sqi,Albanian
+sr=scc,srp,Serbian
+ss=ssw,Siswant
+st=sot,Sotho
+su=sun,Sudanese
+sv=sve,swe,Swedish,Svenska,Sweden
+sw=swa,Swahili
+ta=tam,Tamil
+te=tel,Telugu
+tg=tgk,Tajik
+th=tha,Thai
+ti=tir,Tigrinya
+tk=tuk,Turkmen
+tl=tgl,Tagalog
+tn=tsn,Tswana
+to=tog,Tonga
+tr=tur,Turkish
+ts=tso,Tsonga
+tt=tat,Tatar
+tw=twi,Twi
ty=tah
-ug=uig
-uk=ukr
-ur=urd
-uz=uzb
+ug=uig,Uighur
+uk=ukr,Ukrainian
+ur=urd,Urdu
+uz=uzb,Uzbek
ve=ven
-vi=vie
-vo=vol
+vi=vie,Vietnamese
+vo=vol,Volapük
wa=wln
-wo=wol
-xh=xho
-yi=yid
-yo=yor
-za=zha
-zh=chi/zho
-zu=zul
+wo=wol,Wolof
+xh=xho,Xhosa
+yi=yid,Yiddish
+yo=yor,Yoruba
+za=zha,Zhuang
+zh=chi,zho,Chinese
+zu=zul,Zulu
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Sep 8 12:42:44 2005
@@ -15,14 +15,19 @@
*/
package org.apache.nutch.analysis.lang;
+// JDK imports
import java.util.Properties;
+// JUnit imports
import junit.framework.TestCase;
+
+// Nutch imports
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
+
public class TestHTMLLanguageParser extends TestCase {
private static String URL = "http://foo.bar/";
@@ -61,6 +66,61 @@
}
+ /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
+ public void testParseLanguage() {
+ String tests[][] = {
+ { "(SCHEME=ISO.639-1) sv", "sv" },
+ { "(SCHEME=RFC1766) sv-FI", "sv" },
+ { "(SCHEME=Z39.53) SWE", "sv" },
+ { "EN_US, SV, EN, EN_UK", "en" },
+ { "English Swedish", "en" },
+ { "English, swedish", "en" },
+ { "English,Swedish", "en" },
+ { "Other (Svenska)", "sv" },
+ { "SE", "se" },
+ { "SV", "sv" },
+ { "SV charset=iso-8859-1", "sv" },
+ { "SV-FI", "sv" },
+ { "SV; charset=iso-8859-1", "sv" },
+ { "SVE", "sv" },
+ { "SW", "sw" },
+ { "SWE", "sv" },
+ { "SWEDISH", "sv" },
+ { "Sv", "sv" },
+ { "Sve", "sv" },
+ { "Svenska", "sv" },
+ { "Swedish", "sv" },
+ { "Swedish, svenska", "sv" },
+ { "en, sv", "en" },
+ { "sv", "sv" },
+ { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" },
+ { "sv,en", "sv" },
+ { "sv-FI", "sv" },
+ { "sv-SE", "sv" },
+ { "sv-en", "sv" },
+ { "sv-fi", "sv" },
+ { "sv-se", "sv" },
+ { "sv; Content-Language: sv", "sv" },
+ { "sv_SE", "sv" },
+ { "sve", "sv" },
+ { "svenska, swedish, engelska, english", "sv" },
+ { "sw", "sw" },
+ { "swe", "sv" },
+ { "swe.SPR.", "sv" },
+ { "sweden", "sv" },
+ { "swedish", "sv" },
+ { "swedish,", "sv" },
+ { "text/html; charset=sv-SE", "sv" },
+ { "text/html; sv", "sv" },
+ { "torp, stuga, uthyres, bed & breakfast", null }
+ };
+
+ for (int i=0; i<44; i++) {
+ assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+ }
+ }
+
+
private Content getContent(String text) {
Properties p = new Properties();
p.put("Content-Type", "text/html");
@@ -68,4 +128,5 @@
Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
return content;
}
+
}
Modified: lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml Thu Sep 8 12:42:44 2005
@@ -6,9 +6,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.ontology.Ontology"
- name="Ontology Model Loader"/>
<runtime>
<library name="ontology.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
+
<runtime>
<library name="parse-ext.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Sep 8 12:42:44 2005
@@ -23,6 +23,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.CommandRunner;
@@ -58,7 +59,7 @@
// set TYPE_PARAMS_MAP using plugin.xml of this plugin
static {
Extension[] extensions = PluginRepository.getInstance()
- .getExtensionPoint("org.apache.nutch.parse.Parser").getExtentens();
+ .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
String contentType, command, timeoutString;
@@ -66,7 +67,7 @@
Extension extension = extensions[i];
// only look for extensions defined by plugin parse-ext
- if (!extension.getDiscriptor().getPluginId().equals("parse-ext"))
+ if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
continue;
contentType = extension.getAttribute("contentType");
@@ -151,7 +152,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
Properties metaData = new Properties();
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,13 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
- <extension-point
- id="org.apache.nutch.parse.HtmlParseFilter"
- name="HTML Parse Filter"/>
<runtime>
<library name="parse-html.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Thu Sep 8 12:42:44 2005
@@ -42,7 +42,6 @@
* This class takes SAX events (in addition to some extra events
* that SAX doesn't handle yet) and adds the result to a document
* or document fragment.
- * @xsl.usage general
*/
public class DOMBuilder
implements ContentHandler, LexicalHandler
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Thu Sep 8 12:42:44 2005
@@ -27,7 +27,6 @@
/**
* Class used to verify whether the specified <var>ch</var>
* conforms to the XML 1.0 definition of whitespace.
- * @xsl.usage internal
*/
public class XMLCharacterRecognizer
{
@@ -90,7 +89,7 @@
/**
* Tell if the string is whitespace.
*
- * @param buf StringBuffer to check as XML whitespace.
+ * @param s String to check as XML whitespace.
* @return True if characters in buffer are XML whitespace, false otherwise
*/
public static boolean isWhiteSpace(String s)
Modified: lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,14 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
-
- <extension-point
- id="org.apache.nutch.parse.HtmlParseFilter"
- name="HTML Parse Filter"/>
-
<runtime>
<library name="parse-js.jar">
<export name="*"/>
@@ -23,17 +15,17 @@
name="JS Parser"
point="org.apache.nutch.parse.Parser">
<implementation id="JSParser"
- class="org.apache.nutch.parse.js.JSParseFilter"
- contentType="application/x-javascript"
- pathSuffix="js"/>
+ class="org.apache.nutch.parse.js.JSParseFilter"
+ contentType="application/x-javascript"
+ pathSuffix="js"/>
</extension>
<extension id="org.apache.nutch.parse.js.JSParseFilter"
name="Parse JS Filter"
point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="JSParseFilter"
- class="org.apache.nutch.parse.js.JSParseFilter"
- contentType="application/x-javascript"
- pathSuffix=""/>
+ class="org.apache.nutch.parse.js.JSParseFilter"
+ contentType="application/x-javascript"
+ pathSuffix=""/>
</extension>
</plugin>
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml Thu Sep 8 12:42:44 2005
@@ -1,13 +1,25 @@
<?xml version = '1.0' encoding = 'UTF-8'?>
-<plugin version="1.0.0" provider-name="nutch.org" id="parse-mp3" name="MP3 Parse Plug-in" >
- <extension-point id="org.apache.nutch.parse.Parser" name="Nutch Content Parser" />
- <runtime>
- <library name="parse-mp3.jar" >
- <export name="*" />
- </library>
- <library name="jid3lib-0.5.1.jar" />
- </runtime>
- <extension point="org.apache.nutch.parse.Parser" id="org.apache.nutch.parse.mp3" name="MP3Parse" >
- <implementation class="org.apache.nutch.parse.mp3.MP3Parser" pathSuffix="mp3" id="org.apache.nutch.parse.mp3.MP3Parser" contentType="audio/mpeg" />
- </extension>
+<plugin
+ version="1.0.0"
+ provider-name="nutch.org"
+ id="parse-mp3"
+ name="MP3 Parse Plug-in">
+
+ <runtime>
+ <library name="parse-mp3.jar">
+ <export name="*"/>
+ </library>
+ <library name="jid3lib-0.5.1.jar"/>
+ </runtime>
+
+ <extension point="org.apache.nutch.parse.Parser"
+ id="org.apache.nutch.parse.mp3"
+ name="MP3Parse">
+
+ <implementation class="org.apache.nutch.parse.mp3.MP3Parser"
+ pathSuffix="mp3"
+ id="org.apache.nutch.parse.mp3.MP3Parser"
+ contentType="audio/mpeg"/>
+ </extension>
+
</plugin>
Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,16 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
-
- <!--
- <extension-point
- id="org.apache.nutch.parse.MSWordParseFilter"
- name="MSWord Parse Filter"/>
- -->
-
<runtime>
<library name="parse-msword.jar">
<export name="*"/>
Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Thu Sep 8 12:42:44 2005
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.ParseException;
import java.util.Properties;
@@ -117,7 +118,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
return new ParseImpl(text, parseData);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,15 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
-
- <!--
- <extension-point
- id="org.apache.nutch.parse.PdfParseFilter"
- name="PDF Parse Filter"/>
- -->
<runtime>
<library name="parse-pdf.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Thu Sep 8 12:42:44 2005
@@ -33,6 +33,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.ParseException;
import java.text.SimpleDateFormat;
@@ -161,7 +162,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
Properties metadata = new Properties();
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml Thu Sep 8 12:42:44 2005
@@ -1,13 +1,23 @@
<?xml version = '1.0' encoding = 'UTF-8'?>
-<plugin version="1.0.0" provider-name="nutch.org" id="parse-rtf" name="RTF Parse Plug-in" >
- <extension-point id="org.apache.nutch.parse.Parser" name="Nutch Content Parser" />
- <runtime>
- <library name="parse-rtf.jar" >
- <export name="*" />
- </library>
- <library name="rtf-parser.jar"/>
- </runtime>
- <extension point="org.apache.nutch.parse.Parser" id="org.apache.nutch.parse.rtf" name="RTFParse" >
- <implementation class="org.apache.nutch.parse.rtf.RTFParseFactory" pathSuffix="rtf" id="org.apache.nutch.parse.rtf.RTFParseFactory" contentType="application/rtf" />
- </extension>
+<plugin
+ version="1.0.0"
+ provider-name="nutch.org"
+ id="parse-rtf"
+ name="RTF Parse Plug-in">
+
+ <runtime>
+ <library name="parse-rtf.jar">
+ <export name="*"/>
+ </library>
+ <library name="rtf-parser.jar"/>
+ </runtime>
+
+ <extension point="org.apache.nutch.parse.Parser"
+ id="org.apache.nutch.parse.rtf"
+ name="RTFParse">
+ <implementation class="org.apache.nutch.parse.rtf.RTFParseFactory"
+ pathSuffix="rtf" id="org.apache.nutch.parse.rtf.RTFParseFactory"
+ contentType="application/rtf"/>
+ </extension>
+
</plugin>
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Thu Sep 8 12:42:44 2005
@@ -59,9 +59,12 @@
title = "";
}
- ParseData parseData = new ParseData(title, new Outlink[0], metadata);
+ String text = delegate.getText();
- return new ParseImpl(delegate.getText(), parseData);
+ return new ParseImpl(text,
+ new ParseData(title,
+ OutlinkExtractor.getOutlinks(text),
+ metadata));
}
Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
<runtime>
<library name="parse-text.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Thu Sep 8 12:42:44 2005
@@ -28,7 +28,7 @@
Properties metadata = new Properties();
metadata.putAll(content.getMetadata());
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
+ //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
String encoding =
StringUtil.parseCharacterEncoding(content.getContentType());
@@ -45,6 +45,9 @@
text = new String(content.getContent()); // use default encoding
}
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text,
+ new ParseData(ParseStatus.STATUS_SUCCESS, "",
+ OutlinkExtractor.getOutlinks(text),
+ metadata));
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.protocol.Protocol"
- name="Nutch Protocol"/>
<runtime>
<library name="protocol-file.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.protocol.Protocol"
- name="Nutch Protocol"/>
+
<runtime>
<library name="protocol-ftp.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,10 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.protocol.Protocol"
- name="Nutch Protocol"/>
-
<runtime>
<library name="protocol-http.jar">
<export name="*"/>
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,21 +5,16 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.protocol.Protocol"
- name="Nutch Protocol"/>
-
<runtime>
<library name="protocol-httpclient.jar">
<export name="*"/>
</library>
<library name="commons-codec.jar" />
<library name="commons-httpclient-3.0-rc2.jar" />
-
</runtime>
<extension id="org.apache.nutch.protocol.httpclient"
- name="HttpProtocol"
+ name="HttpProtocol"
point="org.apache.nutch.protocol.Protocol">
<implementation id="org.apache.nutch.protocol.httpclient.Http"
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Thu Sep 8 12:42:44 2005
@@ -18,14 +18,14 @@
import org.apache.commons.httpclient.HttpClientError;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.sun.net.ssl.SSLContext;
import com.sun.net.ssl.TrustManager;
-public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory {
+public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory {
/** Log object for this class. */
private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class);
@@ -58,7 +58,7 @@
}
/**
- * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int,java.net.InetAddress,int)
+ * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
*/
public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException,
UnknownHostException {
@@ -79,8 +79,8 @@
*
* @param host the host name/IP
* @param port the port on the host
- * @param clientHost the local host name/IP to bind the socket to
- * @param clientPort the port on the local machine
+ * @param localAddress the local host name/IP to bind the socket to
+ * @param localPort the port on the local machine
* @param params {@link HttpConnectionParams Http connection parameters}
*
* @return Socket a new socket
@@ -104,14 +104,14 @@
}
/**
- * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int)
+ * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
*/
public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
return getSSLContext().getSocketFactory().createSocket(host, port);
}
/**
- * @see SecureProtocolSocketFactory#createSocket(java.net.Socket,java.lang.String,int,boolean)
+ * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
*/
public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException,
UnknownHostException {
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Sep 8 12:42:44 2005
@@ -9,18 +9,16 @@
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.LinkedList;
-import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.nutch.db.Page;
import org.apache.nutch.pagedb.FetchListEntry;
@@ -47,7 +45,6 @@
}
static final int BUFFER_SIZE = 8 * 1024;
- private static final int MAX_REDIRECTS = NutchConf.get().getInt("http.redirect.max", 3);
private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
private static HttpClient client;
@@ -102,8 +99,6 @@
*/
private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
- private RobotRulesParser robotRules = new RobotRulesParser();
-
private static InetAddress blockAddr(URL url) throws ProtocolException {
InetAddress addr;
try {
@@ -183,7 +178,6 @@
}
public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
try {
return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]));
} catch (MalformedURLException mue) {
@@ -196,9 +190,6 @@
try {
URL url = new URL(urlString);
- int redirects = 0;
- HttpAuthentication auth = null;
- while (true) {
try {
if (!RobotRulesParser.isAllowed(url))
return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
@@ -210,7 +201,7 @@
InetAddress addr = blockAddr(url);
HttpResponse response;
try {
- response = new HttpResponse(urlString, url); // make a request
+ response = new HttpResponse(url); // make a request
} finally {
unblockAddr(addr);
}
@@ -255,19 +246,10 @@
} else if (code == 400) { // bad request, mark as GONE
LOG.fine("400 Bad request: " + url);
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
- } else if (code == 401) { // requires authorization
+ } else if (code == 401) { // requires authorization, but no valid auth provided.
LOG.fine("401 Authentication Required");
- if (redirects == MAX_REDIRECTS)
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
- "Too many redirects: " + urlString));
- Properties p = c.getMetadata();
- if (p instanceof MultiProperties) {
- auth = HttpAuthenticationFactory.findAuthentication((MultiProperties) p);
- } else {
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authorization required: "
- + urlString));
- }
- redirects++;
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ + urlString));
} else if (code == 404) {
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
} else if (code == 410) { // permanently GONE
@@ -276,7 +258,6 @@
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
+ url));
}
- }
} catch (Throwable e) {
e.printStackTrace();
return new ProtocolOutput(null, new ProtocolStatus(e));
@@ -371,55 +352,33 @@
// get a client isntance -- we just need one.
client = new HttpClient(connectionManager);
- // this is just to add logging, whenever cookies are added.
- client.setState(new NutchHttpState());
// Set up an HTTPS socket factory that accepts self-signed certs.
Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443);
Protocol.registerProtocol("https", dummyhttps);
- // set up the connection manager
- // hardcoded for now
-
- connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL);
- //if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
- // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
- //} else {
- // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_TOTAL);
- //}
-
- HttpConnectionParams params = connectionManager.getParams();
+ HttpConnectionManagerParams params = connectionManager.getParams();
params.setConnectionTimeout(TIMEOUT);
params.setSoTimeout(TIMEOUT);
params.setSendBufferSize(BUFFER_SIZE);
params.setReceiveBufferSize(BUFFER_SIZE);
+ params.setMaxTotalConnections(MAX_THREADS_TOTAL);
+ if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
+ params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+ } else {
+ params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+ }
+
HostConfiguration hostConf = client.getHostConfiguration();
if (PROXY) {
hostConf.setProxy(PROXY_HOST, PROXY_PORT);
}
if (NTLM_USERNAME.length() > 0) {
Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
- client.getState().setCredentials(null, null, ntCreds);
+ client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds);
LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
}
LOG.info("Configured Client");
}
-}
-
-class NutchHttpState extends HttpState {
- public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState");
-
- public void addCookie(Cookie cookie) {
- LOG.fine(" - setting cookie: " + cookie);
- super.addCookie(cookie);
- }
-
- public void addCookies(Cookie[] cookies) {
- LOG.fine(" - setting cookies: ");
- for (int i = 0; i < cookies.length; i++)
- LOG.fine(" cookie: " + cookies[i]);
-
- super.addCookies(cookies);
- }
-}
+}
\ No newline at end of file
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Sep 8 12:42:44 2005
@@ -4,24 +4,34 @@
package org.apache.nutch.protocol.httpclient;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import java.util.Properties;
-import java.util.List;
-import java.util.ListIterator;
/**
* An HTTP response.
*/
public class HttpResponse {
+ /** A flag that tells if magic resolution must be performed */
+ private final static boolean MAGIC =
+ NutchConf.get().getBoolean("mime.type.magic", true);
+
+ /** Get the MimeTypes resolver instance. */
+ private final static MimeTypes MIME =
+ MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
private String orig;
private String base;
@@ -54,24 +64,40 @@
public Content toContent() {
String contentType = getHeader("Content-Type");
- if (contentType == null) contentType = "";
+ if (contentType == null) {
+ MimeType type = null;
+ if (MAGIC) {
+ type = MIME.getMimeType(orig, content);
+ } else {
+ type = MIME.getMimeType(orig);
+ }
+ if (type != null) {
+ contentType = type.getName();
+ } else {
+ contentType = "";
+ }
+ }
if (content == null) content = EMPTY_CONTENT;
return new Content(orig, base, content, contentType, headers);
}
- public HttpResponse(URL url) throws ProtocolException, IOException {
- this(url.toString(), url);
- }
-
- public HttpResponse(String orig, URL url) throws IOException {
- this.orig = orig;
+ public HttpResponse(URL url) throws IOException {
this.base = url.toString();
- GetMethod get = new GetMethod(url.toString());
+ this.orig = url.toString();
+ GetMethod get = new GetMethod(this.orig);
get.setFollowRedirects(false);
- get.setStrictMode(false);
get.setRequestHeader("User-Agent", Http.AGENT_STRING);
- get.setHttp11(false);
- get.setMethodRetryHandler(null);
+ HttpMethodParams params = get.getParams();
+ // some servers cannot digest the new protocol
+ params.setVersion(HttpVersion.HTTP_1_0);
+ params.makeLenient();
+ params.setContentCharset("UTF-8");
+ params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+ params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+ // XXX (ab) not sure about this... the default is to retry 3 times; if
+ // XXX the request body was sent the method is not retried, so there is
+ // XXX little danger in retrying...
+ // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
try {
code = Http.getClient().executeMethod(get);
@@ -103,6 +129,7 @@
}
} catch (org.apache.commons.httpclient.ProtocolException pe) {
pe.printStackTrace();
+ get.releaseConnection();
throw new IOException(pe.toString());
} finally {
get.releaseConnection();
Modified: lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
+
<runtime>
<library name="query-basic.jar">
Modified: lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml Thu Sep 8 12:42:44 2005
@@ -5,9 +5,7 @@
version="1.0.0"
provider-name="nutch.org">
- <extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
+
<runtime>
<library name="query-more.jar">