You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/12/06 00:26:31 UTC

svn commit: r1417696 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/benchmark/ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ lucene/licenses/

Author: uschindler
Date: Wed Dec  5 23:26:29 2012
New Revision: 1417696

URL: http://svn.apache.org/viewvc?rev=1417696&view=rev
Log:
Merged revision(s) 1417694 from lucene/dev/trunk:
LUCENE-4589: Upgraded benchmark module's Nekohtml dependency to version 1.9.17, removing the workaround in Lucene's HTML parser for the Turkish locale

Added:
    lucene/dev/branches/branch_4x/lucene/licenses/nekohtml-1.9.17.jar.sha1
      - copied unchanged from r1417694, lucene/dev/trunk/lucene/licenses/nekohtml-1.9.17.jar.sha1
Removed:
    lucene/dev/branches/branch_4x/lucene/licenses/nekohtml-1.9.15.jar.sha1
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/build.xml
    lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml
    lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1417696&r1=1417695&r2=1417696&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Dec  5 23:26:29 2012
@@ -263,6 +263,10 @@ Build
   RandomizedContext.contexts static map. Upgrade randomized testing
   to version 2.0.2 (Mike McCandless, Dawid Weiss)
 
+* LUCENE-4589: Upgraded benchmark module's Nekohtml dependency to version
+  1.9.17, removing the workaround in Lucene's HTML parser for the
+  Turkish locale.  (Uwe Schindler)
+
   
 ======================= Lucene 4.0.0 =======================
 

Modified: lucene/dev/branches/branch_4x/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/build.xml?rev=1417696&r1=1417695&r2=1417696&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/build.xml Wed Dec  5 23:26:29 2012
@@ -152,7 +152,7 @@
     	<fileset dir="lib">
     	  <include name="commons-compress-1.4.1.jar"/>
     	  <include name="xercesImpl-2.9.1.jar"/>
-    	  <include name="nekohtml-1.9.15.jar"/>
+    	  <include name="nekohtml-1.9.17.jar"/>
     	</fileset>
     </path>
     <path id="run.classpath">

Modified: lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml?rev=1417696&r1=1417695&r2=1417696&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml Wed Dec  5 23:26:29 2012
@@ -21,7 +21,7 @@
     <dependencies>
       <dependency org="org.apache.commons" name="commons-compress" rev="1.4.1" transitive="false"/>
       <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
-      <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
+      <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.17" transitive="false"/>
       <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
     </dependencies>
 </ivy-module>

Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1417696&r1=1417695&r2=1417696&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Wed Dec  5 23:26:29 2012
@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTa
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
@@ -65,10 +66,10 @@ public class DemoHTMLParser implements H
         @Override
         public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
           if (inHEAD > 0) {
-            if (equalsIgnoreTurkish("title", localName)) {
+            if ("title".equals(localName)) {
               inTITLE++;
             } else {
-              if (equalsIgnoreTurkish("meta", localName)) {
+              if ("meta".equals(localName)) {
                 String name = atts.getValue("name");
                 if (name == null) {
                   name = atts.getValue("http-equiv");
@@ -82,7 +83,7 @@ public class DemoHTMLParser implements H
           } else if (inBODY > 0) {
             if (SUPPRESS_ELEMENTS.contains(localName)) {
               suppressed++;
-            } else if (equalsIgnoreTurkish("img", localName)) {
+            } else if ("img".equals(localName)) {
               // the original javacc-based parser preserved <IMG alt="..."/>
               // attribute as body text in [] parenthesis:
               final String alt = atts.getValue("alt");
@@ -90,11 +91,11 @@ public class DemoHTMLParser implements H
                 body.append('[').append(alt).append(']');
               }
             }
-          } else if (equalsIgnoreTurkish("body", localName)) {
+          } else if ("body".equals(localName)) {
             inBODY++;
-          } else if (equalsIgnoreTurkish("head", localName)) {
+          } else if ("head".equals(localName)) {
             inHEAD++;
-          } else if (equalsIgnoreTurkish("frameset", localName)) {
+          } else if ("frameset".equals(localName)) {
             throw new SAXException("This parser does not support HTML framesets.");
           }
         }
@@ -102,7 +103,7 @@ public class DemoHTMLParser implements H
         @Override
         public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
           if (inBODY > 0) {
-            if (equalsIgnoreTurkish("body", localName)) {
+            if ("body".equals(localName)) {
               inBODY--;
             } else if (ENDLINE_ELEMENTS.contains(localName)) {
               body.append('\n');
@@ -110,9 +111,9 @@ public class DemoHTMLParser implements H
               suppressed--;
             }
           } else if (inHEAD > 0) {
-            if (equalsIgnoreTurkish("head", localName)) {
+            if ("head".equals(localName)) {
               inHEAD--;
-            } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
+            } else if (inTITLE > 0 && "title".equals(localName)) {
               inTITLE--;
             }
           }
@@ -145,36 +146,8 @@ public class DemoHTMLParser implements H
       this.body = body.toString();
     }
     
-    // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
-    // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
-    
-    // BEGIN: workaround
-    static final String convertTurkish(String s) {
-      return s.replace('i', 'ı');
-    }
-    
-    static final boolean equalsIgnoreTurkish(String s1, String s2) {
-      final int len1 = s1.length(), len2 = s2.length();
-      if (len1 != len2)
-        return false;
-      for (int i = 0; i < len1; i++) {
-        char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
-        if (ch1 == 'ı') ch1 = 'i';
-        if (ch2 == 'ı') ch2 = 'i';
-        if (ch1 != ch2)
-          return false;
-      }
-      return true;
-    }
-    // END: workaround
-    
-    static final Set<String> createElementNameSet(String... names) {
-      final HashSet<String> set = new HashSet<String>();
-      for (final String name : names) {
-        set.add(name);
-        set.add(convertTurkish(name));
-      }
-      return Collections.unmodifiableSet(set);
+    private static final Set<String> createElementNameSet(String... names) {
+      return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(names)));
     }
     
     /** HTML elements that cause a line break (they are block-elements) */