You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by cm...@apache.org on 2002/06/17 15:59:29 UTC

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTaskQueue.java FetcherThread.java FetcherThreadFactory.java RobotExclusionFilter.java ThreadMonitor.java URLMessage.java URLVisitedFilter.java HostInfo.java HostManager.java

cmarschner    2002/06/17 06:59:29

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        Fetcher.java FetcherMain.java FetcherTaskQueue.java
                        FetcherThread.java FetcherThreadFactory.java
                        RobotExclusionFilter.java ThreadMonitor.java
                        URLMessage.java URLVisitedFilter.java
  Removed:     contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        HostInfo.java HostManager.java
  Log:
  added URLNormalizer. Changed filters to use normalized URLs if possible
  
  Revision  Changes    Path
  1.4       +2 -1      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java
  
  Index: Fetcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Fetcher.java	1 Jun 2002 18:55:15 -0000	1.3
  +++ Fetcher.java	17 Jun 2002 13:59:28 -0000	1.4
  @@ -65,6 +65,7 @@
   import java.util.LinkedList;
   
   import de.lanlab.larm.fetcher.FetcherTask;
  +import de.lanlab.larm.net.*;
   
   /**
    * filter class; the Fetcher is the main class which keeps the ThreadPool that
  
  
  
  1.4       +3 -2      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
  
  Index: FetcherMain.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FetcherMain.java	1 Jun 2002 18:55:15 -0000	1.3
  +++ FetcherMain.java	17 Jun 2002 13:59:28 -0000	1.4
  @@ -62,6 +62,7 @@
   import de.lanlab.larm.gui.*;
   import de.lanlab.larm.util.*;
   import de.lanlab.larm.storage.*;
  +import de.lanlab.larm.net.*;
   import javax.swing.UIManager;
   import HTTPClient.*;
   import org.apache.oro.text.regex.MalformedPatternException;
  @@ -278,7 +279,7 @@
       {
           try
           {
  -            messageHandler.putMessage(new URLMessage(url, null, isFrame, null));
  +            messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
           }
           catch (Exception e)
           {
  
  
  
  1.3       +16 -15    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java
  
  Index: FetcherTaskQueue.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FetcherTaskQueue.java	22 May 2002 23:09:17 -0000	1.2
  +++ FetcherTaskQueue.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -186,16 +186,17 @@
       public static void main(String args[])
       {
           FetcherTaskQueue q = new FetcherTaskQueue();
  +        de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10);
           System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
           try
           {
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
           }
           catch (Throwable t)
           {
  @@ -217,9 +218,9 @@
           try
           {
               System.out.println("put 3 lmus.");
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
               System.out.print("pull out 1st element [lmu/1]: ");
               System.out.println(((FetcherTask) q.remove()).getInfo());
               System.out.println("size now [2]: " + q.size());
  @@ -227,9 +228,9 @@
               System.out.println(((FetcherTask) q.remove()).getInfo());
               System.out.println("size now [1]: " + q.size());
               System.out.println("put in 3 yahoos");
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
               System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
               System.out.println("Size now [3]: " + q.size());
               System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
  @@ -237,7 +238,7 @@
               System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
               System.out.println("Size now [1]: " + q.size());
               System.out.println("put in another Yahoo");
  -            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
  +            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
               System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
               System.out.println("Size now [1]: " + q.size());
               System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
  
  
  
  1.3       +2 -1      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java
  
  Index: FetcherThread.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FetcherThread.java	22 May 2002 23:09:17 -0000	1.2
  +++ FetcherThread.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -56,6 +56,7 @@
   
   import de.lanlab.larm.threads.ServerThread;
   import de.lanlab.larm.util.State;
  +import de.lanlab.larm.net.HostManager;
   
   /**
    * a server thread for the thread pool that records the number
  
  
  
  1.3       +75 -58    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java
  
  Index: FetcherThreadFactory.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FetcherThreadFactory.java	22 May 2002 23:09:17 -0000	1.2
  +++ FetcherThreadFactory.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -1,64 +1,69 @@
  -/* ====================================================================
  - * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache" and "Apache Software Foundation" and
  - *    "Apache Lucene" must not be used to endorse or promote products
  - *    derived from this software without prior written permission. For
  - *    written permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache Lucene", nor may "Apache" appear in their name, without
  - *    prior written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  +/*
  + *  ====================================================================
  + *  The Apache Software License, Version 1.1
  + *
  + *  Copyright (c) 2001 The Apache Software Foundation.  All rights
  + *  reserved.
  + *
  + *  Redistribution and use in source and binary forms, with or without
  + *  modification, are permitted provided that the following conditions
  + *  are met:
  + *
  + *  1. Redistributions of source code must retain the above copyright
  + *  notice, this list of conditions and the following disclaimer.
  + *
  + *  2. Redistributions in binary form must reproduce the above copyright
  + *  notice, this list of conditions and the following disclaimer in
  + *  the documentation and/or other materials provided with the
  + *  distribution.
  + *
  + *  3. The end-user documentation included with the redistribution,
  + *  if any, must include the following acknowledgment:
  + *  "This product includes software developed by the
  + *  Apache Software Foundation (http://www.apache.org/)."
  + *  Alternately, this acknowledgment may appear in the software itself,
  + *  if and wherever such third-party acknowledgments normally appear.
  + *
  + *  4. The names "Apache" and "Apache Software Foundation" and
  + *  "Apache Lucene" must not be used to endorse or promote products
  + *  derived from this software without prior written permission. For
  + *  written permission, please contact apache@apache.org.
  + *
  + *  5. Products derived from this software may not be called "Apache",
  + *  "Apache Lucene", nor may "Apache" appear in their name, without
  + *  prior written permission of the Apache Software Foundation.
  + *
  + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + *  DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + *  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + *  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + *  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + *  SUCH DAMAGE.
  + *  ====================================================================
  + *
  + *  This software consists of voluntary contributions made by many
  + *  individuals on behalf of the Apache Software Foundation.  For more
  + *  information on the Apache Software Foundation, please see
  + *  <http://www.apache.org/>.
    */
  -
   package de.lanlab.larm.fetcher;
   import de.lanlab.larm.threads.*;
  +import de.lanlab.larm.net.*;
   
   /**
  - * this factory simply creates fetcher threads. It's passed
  - * to the ThreadPool because the pool is creating the threads on its own
  - * @version $Id$
  + * this factory simply creates fetcher threads. It's passed to the ThreadPool
  + * because the pool is creating the threads on its own
  + *
  + * @author    Administrator
  + * @created   14. Juni 2002
  + * @version   $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17
  + *      cmarschner Exp $
    */
   public class FetcherThreadFactory extends ThreadFactory
   {
  @@ -69,16 +74,28 @@
   
       HostManager hostManager;
   
  +
  +    /**
  +     * Constructor for the FetcherThreadFactory object
  +     *
  +     * @param hostManager  host manager that is passed on to every FetcherThread created by this factory
  +     */
       public FetcherThreadFactory(HostManager hostManager)
       {
           this.hostManager = hostManager;
       }
   
   
  -    public  ServerThread createServerThread(int count)
  +    /**
  +     * Creates a new FetcherThread in this factory's thread group and sets its priority to 4
  +     *
  +     * @param count  value passed as first argument to the FetcherThread constructor
  +     * @return       the newly created FetcherThread
  +     */
  +    public ServerThread createServerThread(int count)
       {
           ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
           newThread.setPriority(4);
           return newThread;
       }
  -}
  \ No newline at end of file
  +}
  
  
  
  1.3       +14 -13    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java
  
  Index: RobotExclusionFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- RobotExclusionFilter.java	22 May 2002 23:09:17 -0000	1.2
  +++ RobotExclusionFilter.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -63,6 +63,7 @@
   import de.lanlab.larm.util.*;
   import de.lanlab.larm.threads.*;
   import HTTPClient.*;
  +import de.lanlab.larm.net.*;
   
   /**
    * this factory simply creates fetcher threads. It's gonna be passed to the
  @@ -164,13 +165,13 @@
               URLMessage urlMsg = ((URLMessage) message);
               URL url = urlMsg.getUrl();
               //assert url != null;
  -            HostInfo h = hostManager.getHostInfo(url.getHost());
  +            HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
               if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
               {
                   log.logThreadSafe("handleRequest: starting to get robots.txt");
                   // probably this results in Race Conditions here
   
  -                rePool.doTask(new RobotExclusionTask(h), new Integer(h.id));
  +                rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
                   h.setLoadingRobotsTxt(true);
               }
   
  @@ -182,7 +183,7 @@
   
                       //log.logThreadSafe("handleRequest: other thread is loading");
                       // assert h.queuedRequests != null
  -                    h.queuedRequests.insert(message);
  +                    h.insertIntoQueue(message);
                       // not thread safe
                       log.logThreadSafe("handleRequest: queued file " + url);
                       return null;
  @@ -273,14 +274,14 @@
               // assert hostInfo != null;
               String threadName = Thread.currentThread().getName();
   
  -            log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName);
  +            log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
               //hostInfo.setLoadingRobotsTxt(true);
               String[] disallows = null;
               boolean errorOccured = false;
               try
               {
                   log.logThreadSafe("task " + threadName + ": getting connection");
  -                HTTPConnection conn = new HTTPConnection(hostInfo.hostName);
  +                HTTPConnection conn = new HTTPConnection(hostInfo.getHostName());
                   conn.setTimeout(30000);
                   // wait at most 30 secs
   
  @@ -348,8 +349,8 @@
                           // crawl everything
                           hostInfo.setLoadingRobotsTxt(false);
                           log.logThreadSafe("task " + threadName + ": error occured");
  -                        log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
  -                        hostInfo.isLoadingRobotsTxt = false;
  +                        log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
  +                        hostInfo.setLoadingRobotsTxt(false);
                           putBackURLs();
                       }
                   }
  @@ -359,8 +360,8 @@
                       {
                           hostInfo.setRobotsChecked(true, disallows);
                           log.logThreadSafe("task " + threadName + ": done");
  -                        log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
  -                        hostInfo.isLoadingRobotsTxt = false;
  +                        log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
  +                        hostInfo.setLoadingRobotsTxt(false);
                           putBackURLs();
                       }
                   }
  @@ -373,12 +374,12 @@
            */
           private void putBackURLs()
           {
  -            while (hostInfo.queuedRequests.size() > 0)
  +            while (hostInfo.getQueueSize() > 0)
               {
  -                messageHandler.putMessage((Message) hostInfo.queuedRequests.remove());
  +                messageHandler.putMessage((Message) hostInfo.removeFromQueue());
               }
               log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
  -            hostInfo.queuedRequests = null;
  +            hostInfo.removeQueue();
           }
   
   
  
  
  
  1.3       +2 -1      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java
  
  Index: ThreadMonitor.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- ThreadMonitor.java	22 May 2002 23:09:17 -0000	1.2
  +++ ThreadMonitor.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -61,6 +61,7 @@
   import java.io.*;
   import de.lanlab.larm.util.State;
   import de.lanlab.larm.util.SimpleLoggerManager;
  +import de.lanlab.larm.net.*;
   
   /**
    * this monitor takes a sample of every thread every x milliseconds,
  
  
  
  1.3       +177 -60   jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java
  
  Index: URLMessage.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- URLMessage.java	22 May 2002 23:09:17 -0000	1.2
  +++ URLMessage.java	17 Jun 2002 13:59:28 -0000	1.3
  @@ -1,66 +1,71 @@
  -/* ====================================================================
  - * The Apache Software License, Version 1.1
  +/*
  + *  ====================================================================
  + *  The Apache Software License, Version 1.1
    *
  - * Copyright (c) 2001 The Apache Software Foundation.  All rights
  - * reserved.
  + *  Copyright (c) 2001 The Apache Software Foundation.  All rights
  + *  reserved.
    *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache" and "Apache Software Foundation" and
  - *    "Apache Lucene" must not be used to endorse or promote products
  - *    derived from this software without prior written permission. For
  - *    written permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache Lucene", nor may "Apache" appear in their name, without
  - *    prior written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  + *  Redistribution and use in source and binary forms, with or without
  + *  modification, are permitted provided that the following conditions
  + *  are met:
  + *
  + *  1. Redistributions of source code must retain the above copyright
  + *  notice, this list of conditions and the following disclaimer.
  + *
  + *  2. Redistributions in binary form must reproduce the above copyright
  + *  notice, this list of conditions and the following disclaimer in
  + *  the documentation and/or other materials provided with the
  + *  distribution.
  + *
  + *  3. The end-user documentation included with the redistribution,
  + *  if any, must include the following acknowledgment:
  + *  "This product includes software developed by the
  + *  Apache Software Foundation (http://www.apache.org/)."
  + *  Alternately, this acknowledgment may appear in the software itself,
  + *  if and wherever such third-party acknowledgments normally appear.
  + *
  + *  4. The names "Apache" and "Apache Software Foundation" and
  + *  "Apache Lucene" must not be used to endorse or promote products
  + *  derived from this software without prior written permission. For
  + *  written permission, please contact apache@apache.org.
  + *
  + *  5. Products derived from this software may not be called "Apache",
  + *  "Apache Lucene", nor may "Apache" appear in their name, without
  + *  prior written permission of the Apache Software Foundation.
  + *
  + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + *  DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + *  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + *  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + *  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + *  SUCH DAMAGE.
  + *  ====================================================================
  + *
  + *  This software consists of voluntary contributions made by many
  + *  individuals on behalf of the Apache Software Foundation.  For more
  + *  information on the Apache Software Foundation, please see
  + *  <http://www.apache.org/>.
    */
  -
   package de.lanlab.larm.fetcher;
   
   import java.net.*;
   import java.io.*;
   import de.lanlab.larm.util.URLUtils;
  +import de.lanlab.larm.net.URLNormalizer;
  +import de.lanlab.larm.net.HostManager;
   
   /**
    * represents a URL which is passed around in the messageHandler
  - * @version $Id$
  + *
  + * @author    Administrator
  + * @created   14. Juni 2002
  + * @version   $Id$
    */
   public class URLMessage implements Message, Serializable
   {
  @@ -68,14 +73,51 @@
        * the URL
        */
       protected URL url;
  -    protected String urlString;
   
  +    /**
  +     * string form of the URL; returned by toString() and getURLString()
  +     */
  +    protected volatile String urlString;
  +
  +    /**
  +     * referer or null
  +     */
       protected URL referer;
  -    protected String refererString;
  +
  +    /**
  +     * externalized referer URL, to prevent multiple calls to url.toExternalForm()
  +     */
  +    protected volatile String refererString;
  +
  +    /**
  +     * normalized referer URL (see URLNormalizer) in external form, or null if there is no referer
  +     */
  +    protected volatile String refererNormalizedString;
  +
  +    /**
  +     * normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer}
  +     * (lower case, index.* removed, all characters except alphanumeric ones escaped)
  +     */
  +    protected String normalizedURLString;
  +
  +
       boolean isFrame;
  +
  +    /**
  +     * anchor text, as in &lt;a href="..."&gt;Anchor&lt;/a&gt;
  +     */
       protected String anchor;
   
  -    public URLMessage(URL url, URL referer, boolean isFrame, String anchor)
  +
  +    /**
  +     * Creates a URLMessage; normalized forms of url and referer are computed using the given hostManager
  +     *
  +     * @param url      the URL this message represents
  +     * @param referer  the referer URL, or null
  +     * @param isFrame  frame flag, stored as-is
  +     * @param anchor   anchor text; null is replaced by an empty string
  +     */
  +    public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager)
       {
           //super();
           this.url = url;
  @@ -83,69 +125,144 @@
   
           this.referer = referer;
           this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
  +        this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null;
           this.isFrame = isFrame;
           this.anchor = anchor != null ? anchor : "";
  +        this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager));
  +        //this.normalizedURLString = URLNormalizer.
           //System.out.println("" + refererString + " -> " + urlString);
       }
   
  +    public String getNormalizedURLString()
  +    {
  +        return this.normalizedURLString;
  +    }
  +
  +    /**
  +     * Gets the url attribute of the URLMessage object
  +     *
  +     * @return   The url value
  +     */
       public URL getUrl()
       {
           return this.url;
       }
   
  +
  +    /**
  +     * Gets the referer attribute of the URLMessage object
  +     *
  +     * @return   The referer value
  +     */
       public URL getReferer()
       {
           return this.referer;
       }
   
   
  +    /**
  +     * Returns the URL string (the same value as getURLString())
  +     *
  +     * @return   The urlString value
  +     */
       public String toString()
       {
           return urlString;
       }
   
  +
  +    /**
  +     * Gets the URL as a string, as opposed to getNormalizedURLString()
  +     *
  +     * @return   The URL string
  +     */
       public String getURLString()
       {
           return urlString;
       }
   
  +
  +    /**
  +     * Gets the refererString attribute of the URLMessage object
  +     *
  +     * @return   The refererString value
  +     */
       public String getRefererString()
       {
           return refererString;
       }
   
  +
  +    /**
  +     * Gets the anchor attribute of the URLMessage object
  +     *
  +     * @return   The anchor value
  +     */
       public String getAnchor()
       {
           return anchor;
       }
   
   
  +    /**
  +     * Returns the hash code of the wrapped URL.
  +     *
  +     * @return   url.hashCode()
  +     */
       public int hashCode()
       {
           return url.hashCode();
       }
   
  -    private void writeObject(java.io.ObjectOutputStream out) throws IOException
  +
  +    /**
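  +     * Writes url, referer, isFrame, anchor and the two normalized strings, in that order.
  +     * NOTE(review): writeUTF(null) throws; refererNormalizedString is null when referer is null.
  +     * @param out              stream the fields are written to
  +     * @exception IOException  if writing to the stream fails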
  +     */
  +    private void writeObject(java.io.ObjectOutputStream out)
  +        throws IOException
       {
           out.writeObject(url);
           out.writeObject(referer);
           out.writeBoolean(isFrame);
           out.writeUTF(anchor);
  +        out.writeUTF(refererNormalizedString);
  +        out.writeUTF(normalizedURLString);
  +
       }
   
  -    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException
  +
  +    /**
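  +     * Reads the fields in the order writeObject wrote them and re-derives urlString and refererString.
  +     * NOTE(review): referer.toExternalForm() fails if the deserialized referer is null.
  +     * @param in                          stream the fields are read from
  +     * @exception IOException             if reading from the stream fails
  +     * @exception ClassNotFoundException  if the class of a serialized URL cannot be found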
  +     */
  +    private void readObject(java.io.ObjectInputStream in)
  +        throws IOException, ClassNotFoundException
       {
  -        url = (URL)in.readObject();
  -        referer = (URL)in.readObject();
  +        url = (URL) in.readObject();
  +        referer = (URL) in.readObject();
           urlString = url.toExternalForm();
           refererString = referer.toExternalForm();
           isFrame = in.readBoolean();
           anchor = in.readUTF();
  +        refererNormalizedString = in.readUTF();
  +        normalizedURLString = in.readUTF();
       }
   
  +
  +    /**
  +     * Builds a tab-separated line: referer (or "&lt;start&gt;"), URL, normalized URL, frame flag (1/0), anchor
  +     *
  +     * @return   The tab-separated info line
  +     */
       public String getInfo()
       {
  -        return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
  +        return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
       }
   
   }
  
  
  
  1.4       +2 -2      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java
  
  Index: URLVisitedFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- URLVisitedFilter.java	1 Jun 2002 18:55:15 -0000	1.3
  +++ URLVisitedFilter.java	17 Jun 2002 13:59:28 -0000	1.4
  @@ -123,7 +123,7 @@
           {
               URLMessage urlMessage = ((URLMessage) message);
               URL url = urlMessage.getUrl();
  -            String urlString = urlMessage.getURLString();
  +            String urlString = urlMessage.getNormalizedURLString();
               if (urlHash.contains(urlString))
               {
                   //System.out.println("URLVisitedFilter: " + urlString + " already present.");
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>