You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/04/22 21:41:58 UTC
svn commit: r1328949 - in /lucene/dev/trunk: dev-tools/scripts/
lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/
lucene/core/src/java/org/apache/lucene/index/
lucene/core/src/java/org/apache/lucene/search/similarities/ l...
Author: mikemccand
Date: Sun Apr 22 19:41:57 2012
New Revision: 1328949
URL: http://svn.apache.org/viewvc?rev=1328949&view=rev
Log:
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs
Added:
lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py (with props)
Modified:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java
Added: lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py?rev=1328949&view=auto
==============================================================================
--- lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py (added)
+++ lucene/dev/trunk/dev-tools/scripts/checkJavadocLinks.py Sun Apr 22 19:41:57 2012
@@ -0,0 +1,156 @@
+import traceback
+import os
+import sys
+import re
+from HTMLParser import HTMLParser, HTMLParseError
+import urlparse
+
+reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
+reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
+
+# silly emacs: '
+
+class FindHyperlinks(HTMLParser):
+
+  def __init__(self, baseURL):
+    HTMLParser.__init__(self)
+    self.anchors = set()
+    self.links = []
+    self.baseURL = baseURL
+    self.printed = False
+
+  def handle_starttag(self, tag, attrs):
+    if tag == 'a':
+      name = None
+      href = None
+      for attName, attValue in attrs:
+        if attName == 'name':
+          name = attValue
+        elif attName == 'href':
+          href = attValue
+
+      if name is not None:
+        assert href is None
+        if name in self.anchors:
+          if name in ('serializedForm',
+                      'serialized_methods',
+                      'readObject(java.io.ObjectInputStream)',
+                      'writeObject(java.io.ObjectOutputStream)') \
+                      and self.baseURL.endswith('/serialized-form.html'):
+            # Seems like a bug in Javadoc generation... you can't have
+            # same anchor name more than once...
+            pass
+          else:
+            self.printFile()
+            print ' WARNING: anchor "%s" appears more than once' % name
+        else:
+          self.anchors.add(name)
+      elif href is not None:
+        assert name is None
+        self.links.append(urlparse.urljoin(self.baseURL, href))
+      else:
+        if self.baseURL.endswith('/AttributeSource.html'):
+          # LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!! Seems to be a javadocs bug... (fixed in Java 7)
+          pass
+        else:
+          raise RuntimeError('BUG: %s' % attrs)
+
+  def printFile(self):
+    if not self.printed:
+      print
+      print ' ' + self.baseURL
+      self.printed = True
+
+def parse(baseURL, html):
+  parser = FindHyperlinks(baseURL)
+  try:
+    parser.feed(html)
+    parser.close()
+  except HTMLParseError:
+    parser.printFile()
+    print ' WARNING: failed to parse:'
+    traceback.print_exc()
+    return [], []
+
+  #print ' %d links, %d anchors' % \
+  # (len(parser.links), len(parser.anchors))
+  return parser.links, parser.anchors
+
+def checkAll(dirName):
+  """
+  Checks *.html (recursively) under this directory.
+  """
+
+  # Find/parse all HTML files first
+  print
+  print 'Crawl/parse...'
+  allFiles = {}
+
+  if os.path.isfile(dirName):
+    root, fileName = os.path.split(dirName)
+    iter = ((root, [], [fileName]),)
+  else:
+    iter = os.walk(dirName)
+
+  for root, dirs, files in iter:
+    for f in files:
+      main, ext = os.path.splitext(f)
+      ext = ext.lower()
+
+      # maybe?:
+      # and main not in ('serialized-form'):
+      if ext in ('.htm', '.html') and \
+         not f.startswith('.#') and \
+         main not in ('deprecated-list',):
+        # Somehow even w/ java 7 generaged javadocs,
+        # deprecated-list.html can fail to escape generics types
+        fullPath = os.path.join(root, f)
+        #print ' %s' % fullPath
+        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
+
+  # ... then verify:
+  print
+  print 'Verify...'
+  for fullPath, (links, anchors) in allFiles.items():
+    #print fullPath
+    printed = False
+    for link in links:
+
+      origLink = link
+
+      # TODO: use urlparse?
+      idx = link.find('#')
+      if idx != -1:
+        anchor = link[idx+1:]
+        link = link[:idx]
+      else:
+        anchor = None
+
+      idx = link.find('?')
+      if idx != -1:
+        link = link[:idx]
+
+      # TODO: normalize path sep for windows...
+      if link.startswith('http://') or link.startswith('https://'):
+        # don't check external links
+        pass
+      elif link not in allFiles:
+        # We only load HTML... so if the link is another resource (eg
+        # SweetSpotSimilarity refs
+        # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
+        if not os.path.exists(link):
+          if not printed:
+            printed = True
+            print
+            print fullPath
+          print ' BROKEN LINK: %s' % link
+      elif anchor is not None and anchor not in allFiles[link][1]:
+        if not printed:
+          printed = True
+          print
+          print fullPath
+        print ' BROKEN ANCHOR: %s' % origLink
+
+if __name__ == '__main__':
+  checkAll(sys.argv[1])
+
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java Sun Apr 22 19:41:57 2012
@@ -191,7 +191,7 @@ public class HyphenationTree extends Ter
* interletter values. In other words, it does something like:
* </p>
* <code>
- * for(i=0; i<patterns.length; i++) {
+ * for(i=0; i&lt;patterns.length; i++) {
* if ( word.substring(index).startsWidth(patterns[i]) )
* update_interletter_values(patterns[i]);
* }
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java Sun Apr 22 19:41:57 2012
@@ -54,7 +54,7 @@ import java.util.ArrayList;
* merge fewer segments (down to 1 at once, if that one has
* deletions) to keep the segment size under budget.
*
- * <p<b>NOTE</b>: this policy freely merges non-adjacent
+ * <p><b>NOTE</b>: this policy freely merges non-adjacent
* segments; if this is a problem, use {@link
* LogMergePolicy}.
*
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Sun Apr 22 19:41:57 2012
@@ -103,7 +103,7 @@ import org.apache.lucene.util.SmallFloat
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>VSM Score</u></font></center>
+ * <center><font size=-1><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br> <br>
@@ -194,7 +194,7 @@ import org.apache.lucene.util.SmallFloat
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
+ * <center><font size=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br> <br>
@@ -291,7 +291,7 @@ import org.apache.lucene.util.SmallFloat
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
+ * <center><font size=-1><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
@@ -410,7 +410,7 @@ import org.apache.lucene.util.SmallFloat
* computes this value as:
*
* <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} =
@@ -476,7 +476,7 @@ import org.apache.lucene.util.SmallFloat
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) =
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java Sun Apr 22 19:41:57 2012
@@ -30,7 +30,7 @@ import org.apache.lucene.util.ByteBlockP
/**
* {@link BytesRefHash} is a special purpose hash-map like data-structure
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
* efficiently in continuous storage. The mapping to the ordinal is
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
* for each added {@link BytesRef}.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java Sun Apr 22 19:41:57 2012
@@ -409,7 +409,7 @@ public final class FieldCacheSanityCheck
* it's typically an indication of a possible problem.
* </p>
* <p>
- * <bPNOTE:</b> Only the reader, fieldname, and cached value are actually
+ * <b>NOTE:</b> Only the reader, fieldname, and cached value are actually
* tested -- if two cache entries have different parsers or datatypes but
* the cached values are the same Object (== not just equal()) this method
* does not consider that a red flag. This allows for subtle variations
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java Sun Apr 22 19:41:57 2012
@@ -253,7 +253,7 @@ public final class PagedBytes {
}
}
- /** 1<<blockBits must be bigger than biggest single
+ /** 1&lt;&lt;blockBits must be bigger than biggest single
* BytesRef slice that will be pulled */
public PagedBytes(int blockBits) {
this.blockSize = 1 << blockBits;
Modified: lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java (original)
+++ lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java Sun Apr 22 19:41:57 2012
@@ -158,7 +158,7 @@ public abstract class FacetRequest imple
}
/**
- * If getNumLabel()<getNumResults(), only the first getNumLabel() results
+ * If getNumLabel() &lt; getNumResults(), only the first getNumLabel() results
* will have their category paths calculated, and the rest will only be
* available as ordinals (category numbers) and will have null paths.
* <P>
Modified: lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java?rev=1328949&r1=1328948&r2=1328949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java (original)
+++ lucene/dev/trunk/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java Sun Apr 22 19:41:57 2012
@@ -1,17 +1,5 @@
package org.apache.lucene.facet.taxonomy.writercache.cl2o;
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import org.apache.lucene.facet.taxonomy.
* limitations under the License.
*/
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+// TODO: maybe this could use an FST instead...
+
/**
* This is a very efficient LabelToOrdinal implementation that uses a
* CharBlockArray to store all labels and a configurable number of HashArrays to
@@ -47,7 +49,7 @@ import org.apache.lucene.facet.taxonomy.
*
* <p>
* This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
+ * Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
* a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
* ~50% faster compared to a HashMap for 3M unique labels.
*