You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/04/22 21:41:58 UTC

svn commit: r1328949 - in /lucene/dev/trunk: dev-tools/scripts/ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/ lucene/core/src/java/org/apache/lucene/index/ lucene/core/src/java/org/apache/lucene/search/similarities/ l...

Author: mikemccand
Date: Sun Apr 22 19:41:57 2012
New Revision: 1328949

URL: http://svn.apache.org/viewvc?rev=1328949&view=rev
Log:
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs

Added:
    lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py   (with props)
Modified:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
    lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
    lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java

Added: lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py?rev=1328949&view=auto
==============================================================================
--- lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py (added)
+++ lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py Sun Apr 22 19:41:57 2012
@@ -0,0 +1,156 @@
+import traceback
+import os
+import sys
+import re
+from HTMLParser import HTMLParser, HTMLParseError
+import urlparse
+
+reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
+reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
+
+# silly emacs: '
+
+class FindHyperlinks(HTMLParser):
+
+  def __init__(self, baseURL):
+    HTMLParser.__init__(self)
+    self.anchors = set()
+    self.links = []
+    self.baseURL = baseURL
+    self.printed = False
+
+  def handle_starttag(self, tag, attrs):
+    if tag == 'a':
+      name = None
+      href = None
+      for attName, attValue in attrs:
+        if attName == 'name':
+          name = attValue
+        elif attName == 'href':
+          href = attValue
+
+      if name is not None:
+        assert href is None
+        if name in self.anchors:
+          if name in ('serializedForm',
+                      'serialized_methods',
+                      'readObject(java.io.ObjectInputStream)',
+                      'writeObject(java.io.ObjectOutputStream)') \
+                      and self.baseURL.endswith('/serialized-form.html'):
+            # Seems like a bug in Javadoc generation... you can't have
+            # same anchor name more than once...
+            pass
+          else:
+            self.printFile()
+            print '    WARNING: anchor "%s" appears more than once' % name
+        else:
+          self.anchors.add(name)
+      elif href is not None:
+        assert name is None
+        self.links.append(urlparse.urljoin(self.baseURL, href))
+      else:
+        if self.baseURL.endswith('/AttributeSource.html'):
+          # LUCENE-4010: AttributeSource's javadocs have an unescaped <A> generic!!  Seems to be a javadocs bug... (fixed in Java 7)
+          pass
+        else:
+          raise RuntimeError('BUG: %s' % attrs)
+
+  def printFile(self):
+    if not self.printed:
+      print
+      print '  ' + self.baseURL
+      self.printed = True
+                   
+def parse(baseURL, html):
+  parser = FindHyperlinks(baseURL)
+  try:
+    parser.feed(html)
+    parser.close()
+  except HTMLParseError:
+    parser.printFile()
+    print '  WARNING: failed to parse:'
+    traceback.print_exc()
+    return [], []
+  
+  #print '    %d links, %d anchors' % \
+  #      (len(parser.links), len(parser.anchors))
+  return parser.links, parser.anchors
+
+def checkAll(dirName):
+  """
+  Checks *.html (recursively) under this directory.
+  """
+
+  # Find/parse all HTML files first
+  print
+  print 'Crawl/parse...'
+  allFiles = {}
+
+  if os.path.isfile(dirName):
+    root, fileName = os.path.split(dirName)
+    iter = ((root, [], [fileName]),)
+  else:
+    iter = os.walk(dirName)
+
+  for root, dirs, files in iter:
+    for f in files:
+      main, ext = os.path.splitext(f)
+      ext = ext.lower()
+
+      # maybe?:
+      # and main not in ('serialized-form'):
+      if ext in ('.htm', '.html') and \
+         not f.startswith('.#') and \
+         main not in ('deprecated-list',):
+        # Somehow even w/ java 7 generated javadocs,
+        # deprecated-list.html can fail to escape generics types
+        fullPath = os.path.join(root, f)
+        #print '  %s' % fullPath
+        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
+
+  # ... then verify:
+  print
+  print 'Verify...'
+  for fullPath, (links, anchors) in allFiles.items():
+    #print fullPath
+    printed = False
+    for link in links:
+
+      origLink = link
+
+      # TODO: use urlparse?
+      idx = link.find('#')
+      if idx != -1:
+        anchor = link[idx+1:]
+        link = link[:idx]
+      else:
+        anchor = None
+
+      idx = link.find('?')
+      if idx != -1:
+        link = link[:idx]
+        
+      # TODO: normalize path sep for windows...
+      if link.startswith('http://') or link.startswith('https://'):
+        # don't check external links
+        pass
+      elif link not in allFiles:
+        # We only load HTML... so if the link is another resource (eg
+        # SweetSpotSimilarity refs
+        # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
+        if not os.path.exists(link):
+          if not printed:
+            printed = True
+            print
+            print fullPath
+          print '  BROKEN LINK: %s' % link
+      elif anchor is not None and anchor not in allFiles[link][1]:
+        if not printed:
+          printed = True
+          print
+          print fullPath
+        print '  BROKEN ANCHOR: %s' % origLink
+        
+if __name__ == '__main__':
+  checkAll(sys.argv[1])
+  

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java Sun Apr 22 19:41:57 2012
@@ -191,7 +191,7 @@ public class HyphenationTree extends Ter
    * interletter values. In other words, it does something like:
    * </p>
    * <code>
-   * for(i=0; i<patterns.length; i++) {
+   * for(i=0; i&lt;patterns.length; i++) {
    * if ( word.substring(index).startsWidth(patterns[i]) )
    * update_interletter_values(patterns[i]);
    * }

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java Sun Apr 22 19:41:57 2012
@@ -54,7 +54,7 @@ import java.util.ArrayList;
  *  merge fewer segments (down to 1 at once, if that one has
  *  deletions) to keep the segment size under budget.
  *      
- *  <p<b>NOTE</b>: this policy freely merges non-adjacent
+ *  <p><b>NOTE</b>: this policy freely merges non-adjacent
  *  segments; if this is a problem, use {@link
  *  LogMergePolicy}.
  *

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Sun Apr 22 19:41:57 2012
@@ -103,7 +103,7 @@ import org.apache.lucene.util.SmallFloat
  *    </table>
  *    </td></tr>
  *    <tr><td>
- *    <center><font=-1><u>VSM Score</u></font></center>
+ *    <center><font size=-1><u>VSM Score</u></font></center>
  *    </td></tr>
  *  </table>
  *  <br>&nbsp;<br>
@@ -194,7 +194,7 @@ import org.apache.lucene.util.SmallFloat
  *    </table>
  *    </td></tr>
  *    <tr><td>
- *    <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
+ *    <center><font size=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
  *    </td></tr>
  *  </table>
  *  <br>&nbsp;<br>
@@ -291,7 +291,7 @@ import org.apache.lucene.util.SmallFloat
  *  </table>
  * </td></tr>
  * <tr><td>
- *  <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
+ *  <center><font size=-1><u>Lucene Practical Scoring Function</u></font></center>
  * </td></tr>
  * </table>
  *
@@ -410,7 +410,7 @@ import org.apache.lucene.util.SmallFloat
  *      computes this value as:
  *
  *      <br>&nbsp;<br>
- *      <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ *      <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  *        <tr>
  *          <td valign="middle" align="right" rowspan="1">
  *            {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} &nbsp; = &nbsp;
@@ -476,7 +476,7 @@ import org.apache.lucene.util.SmallFloat
  *      If the document has multiple fields with the same name, all their boosts are multiplied together:
  *
  *      <br>&nbsp;<br>
- *      <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ *      <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  *        <tr>
  *          <td valign="middle" align="right" rowspan="1">
  *            norm(t,d) &nbsp; = &nbsp;

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java Sun Apr 22 19:41:57 2012
@@ -30,7 +30,7 @@ import org.apache.lucene.util.ByteBlockP
 /**
  * {@link BytesRefHash} is a special purpose hash-map like data-structure
  * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
  * efficiently in continuous storage. The mapping to the ordinal is
  * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
  * for each added {@link BytesRef}.

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java Sun Apr 22 19:41:57 2012
@@ -409,7 +409,7 @@ public final class FieldCacheSanityCheck
      * it's typically an indication of a possible problem.
      * </p>
      * <p>
-     * <bPNOTE:</b> Only the reader, fieldname, and cached value are actually 
+     * <b>NOTE:</b> Only the reader, fieldname, and cached value are actually 
      * tested -- if two cache entries have different parsers or datatypes but 
      * the cached values are the same Object (== not just equal()) this method 
      * does not consider that a red flag.  This allows for subtle variations 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java Sun Apr 22 19:41:57 2012
@@ -253,7 +253,7 @@ public final class PagedBytes {
     }
   }
 
-  /** 1<<blockBits must be bigger than biggest single
+  /** 1&lt;&lt;blockBits must be bigger than biggest single
    *  BytesRef slice that will be pulled */
   public PagedBytes(int blockBits) {
     this.blockSize = 1 << blockBits;

Modified: lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java (original)
+++ lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java Sun Apr 22 19:41:57 2012
@@ -158,7 +158,7 @@ public abstract class FacetRequest imple
   }
 
   /**
-   * If getNumLabel()<getNumResults(), only the first getNumLabel() results
+   * If getNumLabel() &lt; getNumResults(), only the first getNumLabel() results
    * will have their category paths calculated, and the rest will only be
    * available as ordinals (category numbers) and will have null paths.
    * <P>

Modified: lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java (original)
+++ lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java Sun Apr 22 19:41:57 2012
@@ -1,17 +1,5 @@
 package org.apache.lucene.facet.taxonomy.writercache.cl2o;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import org.apache.lucene.facet.taxonomy.
  * limitations under the License.
  */
 
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+// TODO: maybe this could use an FST instead...
+
 /**
  * This is a very efficient LabelToOrdinal implementation that uses a
  * CharBlockArray to store all labels and a configurable number of HashArrays to
@@ -47,7 +49,7 @@ import org.apache.lucene.facet.taxonomy.
  * 
  * <p>
  * This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
+ * Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
  * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
  * ~50% faster compared to a HashMap for 3M unique labels.
  *