You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/08/21 12:50:19 UTC
svn commit: r568053 [3/3] - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/util/ src/java/org/apache/nutch/util/domain/ src/plugin/ src/plugin/tld/ src/plugin/tld/src/ src/plugin/tld/src/java/ src/plugin/tld/src/java/org/ src/plugin/tld/src...

Added: lucene/nutch/trunk/conf/domain-suffixes.xsd
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-suffixes.xsd?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/domain-suffixes.xsd (added)
+++ lucene/nutch/trunk/conf/domain-suffixes.xsd Tue Aug 21 03:50:07 2007
@@ -0,0 +1,130 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!--
+  Document   : domain-suffixes.xsd
+  Author     : Enis Soztutar - enis.soz.nutch@gmail.com
+  Description: This document is the schema for valid domain-suffixes
+  definitions. For successful parsing of domain-suffixes xml files, 
+  the xml file should be validated with this xsd. 
+  See        : org.apache.nutch.util.domain.DomainSuffixesReader.java
+-->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+  targetNamespace="http://lucene.apache.org/nutch"
+  xmlns="http://lucene.apache.org/nutch"
+  elementFormDefault="qualified">
+
+  <!--
+    Root element. Contains, in this order, exactly one "tlds" element
+    followed by exactly one "suffixes" element.
+  -->
+  <xs:element name="domains">
+    <xs:complexType>
+      <xs:sequence>
+        <!-- Top level domains, grouped as itlds, gtlds and cctlds (in that order). -->
+        <xs:element name="tlds">
+          <xs:complexType>
+            <xs:sequence>
+              <!-- One or more "tld" entries of type gtld. -->
+              <xs:element name="itlds">
+                <xs:complexType>
+                  <xs:sequence>
+                    <xs:element name="tld" maxOccurs="unbounded"
+                      type="gtld" />
+                  </xs:sequence>
+                </xs:complexType>
+              </xs:element>
+
+              <!-- One or more "tld" entries of type gtld. -->
+              <xs:element name="gtlds">
+                <xs:complexType>
+                  <xs:sequence>
+                    <xs:element name="tld" maxOccurs="unbounded"
+                      type="gtld" />
+                  </xs:sequence>
+                </xs:complexType>
+              </xs:element>
+
+              <!-- One or more "tld" entries of type cctld (these carry a country). -->
+              <xs:element name="cctlds">
+                <xs:complexType>
+                  <xs:sequence>
+                    <xs:element name="tld" maxOccurs="unbounded"
+                      type="cctld" />
+                  </xs:sequence>
+                </xs:complexType>
+              </xs:element>
+
+            </xs:sequence>
+          </xs:complexType>
+        </xs:element>
+
+        <!-- One or more "suffix" entries of type sldType. -->
+        <xs:element name="suffixes">
+          <xs:complexType>
+            <xs:sequence>
+              <xs:element name="suffix" maxOccurs="unbounded"
+                type="sldType" />
+            </xs:sequence>
+          </xs:complexType>
+        </xs:element>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+
+  <!--
+    Type of the "tld" entries listed under "itlds" and "gtlds".
+    All child elements are optional but must appear in the order
+    status, boost, description. The suffix itself is given in the
+    "domain" attribute.
+  -->
+  <xs:complexType name="gtld">
+    <xs:sequence>
+      <!-- Status is restricted to the values enumerated inline below. -->
+      <xs:element name="status" minOccurs="0">
+        <xs:simpleType>
+          <xs:restriction base="xs:string">
+            <xs:enumeration value="INFRASTRUCTURE" />
+            <xs:enumeration value="SPONSORED" />
+            <xs:enumeration value="UNSPONSORED" />
+            <xs:enumeration value="STARTUP" />
+            <xs:enumeration value="PROPOSED" />
+            <xs:enumeration value="DELETED" />
+            <xs:enumeration value="PSEUDO_DOMAIN" />
+          </xs:restriction>
+        </xs:simpleType>
+      </xs:element>
+      <xs:element name="boost" type="xs:float" minOccurs="0" />
+      <xs:element name="description" type="xs:string" minOccurs="0" />
+    </xs:sequence>
+    <xs:attribute name="domain" type="xs:string" />
+  </xs:complexType>
+
+  <!--
+    Type of the "tld" entries listed under "cctlds".
+    "country" is mandatory (no minOccurs is given); status, boost and
+    description are optional. Status values come from statusType.
+  -->
+  <xs:complexType name="cctld">
+    <xs:sequence>
+      <xs:element name="country" type="xs:string" />
+      <xs:element name="status" type="statusType" minOccurs="0" />
+      <xs:element name="boost" type="xs:float" minOccurs="0" />
+      <xs:element name="description" type="xs:string" minOccurs="0" />
+    </xs:sequence>
+    <xs:attribute name="domain" type="xs:string" />
+  </xs:complexType>
+
+  <!--
+    Type of the "suffix" entries listed under "suffixes".
+    status, boost and description are all optional; the suffix itself
+    is given in the "domain" attribute.
+  -->
+  <xs:complexType name="sldType">
+    <xs:sequence>
+      <xs:element name="status" type="statusType" minOccurs="0" />
+      <xs:element name="boost" type="xs:float" minOccurs="0" />
+      <xs:element name="description" type="xs:string" minOccurs="0" />
+    </xs:sequence>
+    <xs:attribute name="domain" type="xs:string" />
+  </xs:complexType>
+
+  <!-- Status values accepted by the cctld and sldType definitions. -->
+  <xs:simpleType name="statusType">
+    <xs:restriction base="xs:string">
+      <xs:enumeration value="IN_USE" />
+      <xs:enumeration value="NOT_IN_USE" />
+      <xs:enumeration value="DELETED" />
+    </xs:restriction>
+  </xs:simpleType>
+
+</xs:schema>

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.regex.Pattern;
+
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/**
+ * Utility class for URL analysis. All methods work on the hostname of the
+ * url; a single trailing dot in the hostname is ignored by every method.
+ */
+public class URLUtil {
+
+  /** Matches a dotted-quad IPv4 address such as <code>127.0.0.1</code>. */
+  private static final Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+
+  /**
+   * Returns the hostname of the url without a trailing dot.
+   * It seems that java returns hostnames ending with "." unchanged, so
+   * the dot is removed here once for all callers.
+   */
+  private static String getNormalizedHost(URL url) {
+    String host = url.getHost();
+    if (host.endsWith(".")) {
+      host = host.substring(0, host.length() - 1);
+    }
+    return host;
+  }
+
+  /** Returns the domain name of the url. The domain name of a url is
+   *  the substring of the url's hostname, w/o subdomain names. As an
+   *  example <br><code>
+   *  getDomainName(new URL("http://lucene.apache.org/"))
+   *  </code><br>
+   *  will return <br><code> apache.org</code>
+   *  <br>
+   *  An IPv4 hostname is returned unchanged. If no part of the hostname
+   *  is a registered domain suffix, the last segment of the hostname is
+   *  returned.
+   *
+   * @param url the url to analyze
+   * @return the domain name of the url's hostname
+   */
+  public static String getDomainName(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = getNormalizedHost(url);
+    if (IP_PATTERN.matcher(host).matches()) {
+      return host;
+    }
+
+    // Drop the leftmost segment until what follows it is a known suffix.
+    // The body runs at least once, and one final time after the last dot
+    // has been consumed (indexOf returns -1, substring(0) is the whole rest).
+    String candidate = host;
+    int index;
+    do {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      if (tlds.isDomainSuffix(subCandidate)) {
+        return candidate;
+      }
+      candidate = subCandidate;
+    } while (index >= 0);
+    return candidate;
+  }
+
+  /** Returns the domain name of the url. The domain name of a url is
+   *  the substring of the url's hostname, w/o subdomain names. As an
+   *  example <br><code>
+   *  getDomainName("http://lucene.apache.org/")
+   *  </code><br>
+   *  will return <br><code> apache.org</code>
+   *
+   * @param url the url to analyze
+   * @return the domain name of the url's hostname
+   * @throws MalformedURLException if the string is not a valid url
+   */
+  public static String getDomainName(String url) throws MalformedURLException {
+    return getDomainName(new URL(url));
+  }
+
+  /** Returns whether the given urls have the same domain name.
+   * As an example, <br>
+   * <code> isSameDomainName(new URL("http://lucene.apache.org")
+   * , new URL("http://people.apache.org/"))
+   * <br> will return true. </code>
+   * The comparison ignores case.
+   *
+   * @return true if the domain names are equal
+   */
+  public static boolean isSameDomainName(URL url1, URL url2) {
+    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
+  }
+
+  /** Returns whether the given urls have the same domain name.
+   * As an example, <br>
+   * <code> isSameDomainName("http://lucene.apache.org"
+   * ,"http://people.apache.org/")
+   * <br> will return true. </code>
+   *
+   * @return true if the domain names are equal
+   * @throws MalformedURLException if either string is not a valid url
+   */
+  public static boolean isSameDomainName(String url1, String url2)
+    throws MalformedURLException {
+    return isSameDomainName(new URL(url1), new URL(url2));
+  }
+
+  /** Returns the {@link DomainSuffix} corresponding to the
+   * last public part of the hostname.
+   *
+   * @param url the url to analyze
+   * @return the longest registered suffix of the hostname, or
+   *   <code>null</code> if the hostname is an IPv4 address or no
+   *   registered suffix matches
+   */
+  public static DomainSuffix getDomainSuffix(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    // Normalize exactly as getDomainName does, so both agree on hosts
+    // written with a trailing dot.
+    String host = getNormalizedHost(url);
+    if (IP_PATTERN.matcher(host).matches()) {
+      return null;
+    }
+
+    String candidate = host;
+    int index;
+    do {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      DomainSuffix suffix = tlds.get(subCandidate);
+      if (suffix != null) {
+        return suffix;
+      }
+      candidate = subCandidate;
+    } while (index >= 0);
+    return null;
+  }
+
+  /** Returns the {@link DomainSuffix} corresponding to the
+   * last public part of the hostname.
+   *
+   * @param url the url to analyze
+   * @return the matching suffix or <code>null</code>, see
+   *   {@link #getDomainSuffix(URL)}
+   * @throws MalformedURLException if the string is not a valid url
+   */
+  public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException {
+    return getDomainSuffix(new URL(url));
+  }
+
+  /** Partitions of the hostname of the url by ".".
+   *
+   * @param url the url to analyze
+   * @return the segments of the hostname; an IPv4 hostname is returned
+   *   as a single segment
+   */
+  public static String[] getHostSegments(URL url) {
+    String host = getNormalizedHost(url);
+    //return whole hostname, if it is an ipv4
+    //TODO : handle ipv6
+    if (IP_PATTERN.matcher(host).matches()) {
+      return new String[] {host};
+    }
+    return host.split("\\.");
+  }
+
+  /** Partitions of the hostname of the url by ".".
+   *
+   * @param url the url to analyze
+   * @return the segments of the hostname, see {@link #getHostSegments(URL)}
+   * @throws MalformedURLException if the string is not a valid url
+   */
+  public static String[] getHostSegments(String url) throws MalformedURLException {
+    return getHostSegments(new URL(url));
+  }
+
+  /** For testing: prints the domain name of the url given as the only argument. */
+  public static void main(String[] args){
+
+    if(args.length!=1) {
+      System.err.println("Usage : URLUtil <url>");
+      return ;
+    }
+
+    String url = args[0];
+    try {
+      System.out.println(URLUtil.getDomainName(new URL(url)));
+    }
+    catch (MalformedURLException ex) {
+      ex.printStackTrace();
+    }
+  }
+}

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * This class represents the last part of the host name, 
+ * which is operated by authoritives, not individuals. This information 
+ * is needed to find the domain name of a host. The domain name of a host
+ * is defined to be the last part before the domain suffix, w/o subdomain 
+ * names.  As an example the domain name of <br><code> http://lucene.apache.org/ 
+ * </code><br> is <code> apache.org</code>   
+ * <br>
+ * This class holds three fields,  
+ * <strong>domain</strong> field represents the suffix (such as "co.uk")
+ * <strong>boost</strong> is a float for boosting score of url's with this suffix
+ * <strong>status</strong> field represents domain's status
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see TopLevelDomain
+ * @see domain-suffixes.xml
+ */
+public class DomainSuffix {
+
+  /**
+   * Enumeration of the status of the tld. Please see domain-suffixes.xml. 
+   */
+  public enum Status { INFRASTRUCTURE, SPONSORED, UNSPONSORED
+    , STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+  };
+
+  private String domain;
+  private Status status;
+  private float boost;
+
+  public static final float DEFAULT_BOOST = 1.0f;
+  public static final Status DEFAULT_STATUS = Status.IN_USE;
+  
+  public DomainSuffix(String domain, Status status, float boost) {
+    this.domain = domain;
+    this.status = status;
+    this.boost = boost;
+  }
+
+  public DomainSuffix(String domain) {
+    this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
+  }
+  
+  public String getDomain() {
+    return domain;
+  }
+
+  public Status getStatus() {
+    return status;
+  }
+
+  public float getBoost() {
+    return boost;
+  }
+  
+  @Override
+  public String toString() {
+    return domain;
+  }
+}

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.InputStream;
+import java.util.HashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Storage class for <code>DomainSuffix</code> objects, keyed by the
+ * string returned from {@link DomainSuffix#getDomain()}.
+ * Note: this class is a singleton, obtained with {@link #getInstance()}.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class DomainSuffixes {
+  private static final Log LOG = LogFactory.getLog(DomainSuffixes.class);
+
+  /** Name of the classpath resource the suffixes are read from. */
+  private static final String SUFFIX_FILE = "domain-suffixes.xml";
+
+  private final HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
+  private static DomainSuffixes instance;
+
+  /**
+   * private ctor; loads the suffixes from the classpath. Loading is best
+   * effort: a missing or unreadable file is logged and leaves this
+   * instance with whatever entries were read before the failure.
+   */
+  private DomainSuffixes() {
+    InputStream input =
+      DomainSuffixes.class.getClassLoader().getResourceAsStream(SUFFIX_FILE);
+    if (input == null) {
+      // getResourceAsStream signals a missing resource with null
+      LOG.warn("Could not find " + SUFFIX_FILE + " on the classpath");
+      return;
+    }
+    try {
+      new DomainSuffixesReader().read(this, input);
+    }
+    catch (Exception ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+    }
+    finally {
+      try {
+        input.close();
+      }
+      catch (java.io.IOException ignored) {
+        // nothing useful can be done if closing a read-only stream fails
+      }
+    }
+  }
+
+  /**
+   * Singleton instance, lazy instantiation. The method is synchronized
+   * so that concurrent first calls create and see a single instance.
+   * @return the shared instance
+   */
+  public static synchronized DomainSuffixes getInstance() {
+    if(instance == null) {
+      instance = new DomainSuffixes();
+    }
+    return instance;
+  }
+
+  /** Registers the suffix under its domain string, replacing any previous entry. */
+  void addDomainSuffix(DomainSuffix tld) {
+    domains.put(tld.getDomain(), tld);
+  }
+
+  /** return whether the extension is a registered domain entry */
+  public boolean isDomainSuffix(String extension) {
+    return domains.containsKey(extension);
+  }
+
+  /**
+   * Return the {@link DomainSuffix} object for the extension, if
+   * extension is a top level domain returned object will be an
+   * instance of {@link TopLevelDomain}
+   * @param extension of the domain
+   * @return the registered suffix, or <code>null</code> if the extension
+   *   is not registered
+   */
+  public DomainSuffix get(String extension) {
+    return domains.get(extension);
+  }
+
+}

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,159 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.domain.DomainSuffix.Status;
+import org.apache.nutch.util.domain.TopLevelDomain.Type;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * For parsing xml files containing domain suffix definitions.
+ * Parsed xml files should validate against
+ * <code>domain-suffixes.xsd</code>
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+class DomainSuffixesReader {
+
+  private static final Log LOG = LogFactory.getLog(DomainSuffixesReader.class);
+
+  /**
+   * Parses the stream and adds every tld and suffix found to tldEntries.
+   * The stream is not closed by this method.
+   *
+   * @param tldEntries the storage to fill
+   * @param input xml document with a <code>domains</code> root element
+   * @throws IOException if the document cannot be parsed, has a different
+   *   root element, or lacks one of the required container elements
+   */
+  void read(DomainSuffixes tldEntries, InputStream input) throws IOException{
+    try {
+
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      factory.setIgnoringComments(true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(input));
+
+      Element root = document.getDocumentElement();
+
+      if(root == null || !root.getTagName().equals("domains")) {
+        throw new IOException("xml file is not valid");
+      }
+
+      // Resolve every container before adding anything, so that a
+      // structurally broken file fails before entries are stored.
+      Element tlds = requiredChild(root, "tlds");
+      Element suffixes = requiredChild(root, "suffixes");
+      Element itlds = requiredChild(tlds, "itlds");
+      Element gtlds = requiredChild(tlds, "gtlds");
+      Element cctlds = requiredChild(tlds, "cctlds");
+
+      //read tlds
+      readITLDs(tldEntries, itlds);
+      readGTLDs(tldEntries, gtlds);
+      readCCTLDs(tldEntries, cctlds);
+
+      readSuffixes(tldEntries, suffixes);
+    }
+    catch (ParserConfigurationException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw asIOException(ex);
+    }
+    catch (SAXException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw asIOException(ex);
+    }
+  }
+
+  /** Wraps the exception in an IOException that keeps it as the cause. */
+  private static IOException asIOException(Exception cause) {
+    IOException wrapped = new IOException(cause.getMessage());
+    wrapped.initCause(cause);
+    return wrapped;
+  }
+
+  /** Returns the first descendant element with the given tag, or fails if there is none. */
+  private static Element requiredChild(Element parent, String tagName)
+    throws IOException {
+    NodeList list = parent.getElementsByTagName(tagName);
+    if(list == null || list.getLength() == 0)
+      throw new IOException("xml file is not valid: missing <" + tagName + "> element");
+    return (Element)list.item(0);
+  }
+
+  /**
+   * Returns the trimmed text of the first descendant element with the
+   * given tag, or null if there is no such element or it has no text.
+   */
+  private static String optionalText(Element el, String tagName) {
+    NodeList list = el.getElementsByTagName(tagName);
+    if(list == null || list.getLength() == 0)
+      return null;
+    String text = list.item(0).getTextContent();
+    if(text == null)
+      return null;
+    text = text.trim();
+    return text.length() == 0 ? null : text;
+  }
+
+  /** Adds every <code>tld</code> below el as an infrastructure tld. */
+  void readITLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.INFRASTRUCTURE));
+    }
+  }
+
+  /** Adds every <code>tld</code> below el as a generic tld. */
+  void readGTLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.GENERIC));
+    }
+  }
+
+  /**
+   * Adds every <code>tld</code> below el as a country code tld.
+   * @throws IOException if an entry has no country element
+   */
+  void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readCCTLD((Element)children.item(i)));
+    }
+  }
+
+  /** Builds a tld of the given type from the domain attribute and the optional fields of el. */
+  TopLevelDomain readGTLD(Element el, Type type) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new TopLevelDomain(domain, type, status, boost);
+  }
+
+  /**
+   * Builds a country code tld from the domain attribute, the optional
+   * fields and the country element of el.
+   * @throws IOException if el has no country element
+   */
+  TopLevelDomain readCCTLD(Element el) throws IOException {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    String countryName = readCountryName(el);
+    return new TopLevelDomain(domain, status, boost, countryName);
+  }
+
+  /**
+   * read optional field status
+   * @return the parsed status, or {@link DomainSuffix#DEFAULT_STATUS}
+   *   if the element is absent or empty
+   * @throws IllegalArgumentException if the text is not a {@link Status} name
+   */
+  Status readStatus(Element el) {
+    String text = optionalText(el, "status");
+    if(text == null)
+      return DomainSuffix.DEFAULT_STATUS;
+    return Status.valueOf(text);
+  }
+
+  /**
+   * read optional field boost
+   * @return the parsed boost, or {@link DomainSuffix#DEFAULT_BOOST}
+   *   if the element is absent or empty
+   * @throws NumberFormatException if the text is not a float
+   */
+  float readBoost(Element el) {
+    String text = optionalText(el, "boost");
+    if(text == null)
+      return DomainSuffix.DEFAULT_BOOST;
+    return Float.parseFloat(text);
+  }
+
+  /**
+   * read field countryname
+   * @return the trimmed text of the country element
+   * @throws IOException if el has no country element
+   */
+  String readCountryName(Element el) throws IOException {
+    NodeList list = el.getElementsByTagName("country");
+    if(list == null || list.getLength() == 0)
+      throw new IOException("Country name should be given");
+    // getNodeValue() of an element node is always null; the name is the
+    // text inside the element.
+    String text = list.item(0).getTextContent();
+    return text == null ? "" : text.trim();
+  }
+
+  /** Adds every <code>suffix</code> below el. */
+  void readSuffixes(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("suffix");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readSuffix((Element)children.item(i)));
+    }
+  }
+
+  /** Builds a suffix from the domain attribute and the optional fields of el. */
+  DomainSuffix readSuffix(Element el) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new DomainSuffix(domain, status, boost);
+  }
+
+}

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an
+ * Internet domain name; that is, the letters which follow the final
+ * dot of any domain name. For example, in the domain name
+ * <code>www.website.com</code>, the top-level domain is <code>com</code>.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see <a href="http://www.iana.org/">http://www.iana.org/</a>
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a>
+ */
+public class TopLevelDomain extends DomainSuffix {
+
+  /** The kind of a top-level domain. */
+  public enum Type { INFRASTRUCTURE, GENERIC, COUNTRY }
+
+  private final Type type;
+  private final String countryName;
+
+  /** Single place where all fields are assigned; both public constructors delegate here. */
+  private TopLevelDomain(String domain, Type type, Status status, float boost,
+      String countryName) {
+    super(domain, status, boost);
+    this.type = type;
+    this.countryName = countryName;
+  }
+
+  /** Creates a top-level domain of the given type without a country name. */
+  public TopLevelDomain(String domain, Type type, Status status, float boost){
+    this(domain, type, status, boost, null);
+  }
+
+  /** Creates a top-level domain of type {@link Type#COUNTRY} with the given country name. */
+  public TopLevelDomain(String domain, Status status, float boost, String countryName){
+    this(domain, Type.COUNTRY, status, boost, countryName);
+  }
+
+  /** @return the type of this top-level domain */
+  public Type getType() {
+    return type;
+  }
+
+  /** Returns the country name if TLD is Country Code TLD
+   * @return country name or null
+   */
+  public String getCountryName(){
+    return countryName;
+  }
+
+}

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/package.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/package.html?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/package.html (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/domain/package.html Tue Aug 21 03:50:07 2007
@@ -0,0 +1,16 @@
+<html>
+<body>
+<h2> org.apache.nutch.util.domain</h2>
+
+<p>This package contains classes for domain analysis.</p>
+
+For more information, please refer to the following URLs:
+<ul>
+<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li>
+<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li>
+<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li>
+<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li>
+</ul>
+
+</body>
+</html>

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=568053&r1=568052&r2=568053&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Aug 21 03:50:07 2007
@@ -68,6 +68,7 @@
      <ant dir="summary-basic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="summary-lucene" target="deploy"/>
+     <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
@@ -158,6 +159,7 @@
     <ant dir="subcollection" target="clean"/>
     <ant dir="summary-basic" target="clean"/>
     <ant dir="summary-lucene" target="clean"/>
+    <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/tld/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/build.xml?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/tld/build.xml Tue Aug 21 03:50:07 2007
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/tld/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/plugin.xml?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/tld/plugin.xml Tue Aug 21 03:50:07 2007
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="tld"
+   name="Top Level Domain Plugin"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="tld.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.tld"
+              name="Top Level Domain Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="TLDIndexingFilter"
+                      class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+   </extension>
+
+   <extension id="org.apache.nutch.scoring.tld"
+              name="Top Level Domain Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+                      class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+   </extension>
+
+
+</plugin>

Added: lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * Indexing filter that records the domain suffix (top level domain) of each
+ * document's URL in a stored, non-indexed <code>tld</code> field. Documents
+ * whose host has no known suffix (e.g. IP addresses) are left unchanged.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Log LOG = LogFactory.getLog(TLDIndexingFilter.class);
+
+  private Configuration conf;
+
+  /** Adds the <code>tld</code> field when a suffix is found; failures are logged, not thrown. */
+  public Document filter(Document doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
+  throws IndexingException {
+    try {
+      DomainSuffix d = URLUtil.getDomainSuffix(new URL(urlText.toString()));
+      // getDomainSuffix yields null for hosts without a known suffix; skip those
+      if (d != null) {
+        // store, no index
+        doc.add(new Field("tld", d.getDomain(), Field.Store.YES, Field.Index.NO));
+      }
+    } catch (Exception ex) {
+      // best effort: a bad URL must not prevent the document from being indexed
+      LOG.warn("Could not determine tld for url: " + urlText, ex);
+    }
+    return doc;
+  }
+
+  /** Stores the configuration supplied by the caller. */
+  public void setConf(Configuration conf) { this.conf = conf; }
+
+  /** Returns the configuration last passed to setConf. */
+  public Configuration getConf() { return this.conf; }
+}

Added: lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html (added)
+++ lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html Tue Aug 21 03:50:07 2007
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>

Added: lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+
+/**
+ * Scoring filter that boosts documents by top level domain: at indexing time
+ * the score is multiplied by the boost of every known <code>tld</code> value.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+  private Configuration conf;
+  private DomainSuffixes tldEntries;
+
+  public TLDScoringFilter() {
+    tldEntries = DomainSuffixes.getInstance();
+  }
+
+  /**
+   * Multiplies <code>initScore</code> by the boost of each suffix named in the
+   * document's <code>tld</code> values; values without an entry are skipped.
+   */
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    final String[] values = doc.getValues("tld");
+    float boost = 1.0f;
+    for (int i = 0; values != null && i < values.length; i++) {
+      final DomainSuffix suffix = tldEntries.get(values[i]);
+      if (suffix == null) continue;
+      boost *= suffix.getBoost();
+    }
+    return initScore * boost;
+  }
+
+  /** Leaves outlink scores alone; hands back <code>adjust</code> as given. */
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Leaves outlink scores alone; hands back <code>adjust</code> as given. */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Keeps the generator sort value untouched. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  // The remaining ScoringFilter callbacks are intentionally empty.
+
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException { }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException { }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException { }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException { }
+
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List inlinked) throws ScoringFilterException { }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+}

Added: lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html (added)
+++ lucene/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html Tue Aug 21 03:50:07 2007
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>

Added: lucene/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=568053&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Tue Aug 21 03:50:07 2007
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+/** Test class for URLUtil */
+public class TestURLUtil extends TestCase {
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+  }
+
+  /** Checks URLUtil.getDomainName on hosts with known, unknown and multi-part suffixes. */
+  public void testGetDomainName() throws Exception{
+
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    assertEquals("apache.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
+    assertEquals("wikipedia.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    assertEquals("140.211.11.130", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    assertEquals("example.co.uk", URLUtil.getDomainName(url));
+
+    url = new URL("http://com");
+    assertEquals("com", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk.com");
+    assertEquals("uk.com", URLUtil.getDomainName(url));
+
+    //"nn" is not a tld
+    url = new URL("http://example.com.nn");
+    assertEquals("nn", URLUtil.getDomainName(url));
+
+    url = new URL("http://");
+    assertEquals("", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    assertEquals("xyz", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.c.se");
+    assertEquals("example.c.se", URLUtil.getDomainName(url));
+
+    //plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    assertEquals("example.plc.co.im", URLUtil.getDomainName(url));
+
+    //2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    assertEquals("example.2000.hu", URLUtil.getDomainName(url));
+
+    //test non-ascii (label is U+5546 U+696D, escaped so re-encoding cannot garble it)
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    assertEquals("example.\u5546\u696d.tw", URLUtil.getDomainName(url));
+  }
+
+  /** Checks URLUtil.getDomainSuffix, including the cases where it returns null. */
+  public void testGetDomainSuffix() throws Exception{
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://com");
+    assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://www.example.co.uk.com");
+    assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    //"nn" is not a tld
+    url = new URL("http://example.com.nn");
+    assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://");
+    assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://subdomain.example.edu.tr");
+    assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.fr");
+    assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.tr");
+    assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    //plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());
+
+    //2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());
+
+    //test non-ascii (label is U+5546 U+696D, escaped so re-encoding cannot garble it)
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    assertEquals("\u5546\u696d.tw", URLUtil.getDomainSuffix(url).getDomain());
+  }
+
+  /** Checks how URLUtil.getHostSegments splits host names, an empty host and an IP address. */
+  public void testGetHostSegments() throws Exception{
+    URL url;
+    String[] segments;
+
+    url = new URL("http://subdomain.example.edu.tr");
+    segments = URLUtil.getHostSegments(url);
+    assertEquals("subdomain", segments[0]);
+    assertEquals("example", segments[1]);
+    assertEquals("edu", segments[2]);
+    assertEquals("tr", segments[3]);
+
+    url = new URL("http://");
+    segments = URLUtil.getHostSegments(url);
+    assertEquals(1, segments.length);
+    assertEquals("", segments[0]);
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    segments = URLUtil.getHostSegments(url);
+    assertEquals(1, segments.length);
+    assertEquals("140.211.11.130", segments[0]);
+
+    //test non-ascii (label is U+5546 U+696D, escaped so re-encoding cannot garble it)
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    segments = URLUtil.getHostSegments(url);
+    assertEquals("www", segments[0]);
+    assertEquals("example", segments[1]);
+    assertEquals("\u5546\u696d", segments[2]);
+    assertEquals("tw", segments[3]);
+  }
+
+}