You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by va...@apache.org on 2009/01/09 04:28:41 UTC
svn commit: r732916 [11/14] - in /lucene/pylucene/trunk: ./ java/ java/org/
java/org/osafoundation/ java/org/osafoundation/lucene/
java/org/osafoundation/lucene/analysis/
java/org/osafoundation/lucene/queryParser/
java/org/osafoundation/lucene/search/ ...
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf Thu Jan 8 19:28:33 2009
@@ -0,0 +1,25 @@
+{\rtf1\ansi\deff1\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\froman\fprq2\fcharset0 Times New Roman;}}
+{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\snext1 Default;}
+{\s2\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\aspalpha\rtlch\lang1025\ltrch\dbch\af1\loch\f1\sbasedon1\snext2 Normal;}
+{\s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\sbasedon1\snext3 Header;}
+{\s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\sbasedon1\snext4 Footer;}
+{\*\cs6\cf0\rtlch\lang1033\ltrch\dbch\af1\loch\f1\sbasedon7 Default Paragraph Font;}
+{\*\cs7\cf0\rtlch\lang1025\ltrch\dbch\af1\loch\f1 Normal;}
+}
+{\info{\comment StarWriter}{\vern6410}}\deftab720
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}
+{\pgdsc1\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn720\margbsxn720\headery0{\*\headeryb0\headerxl0\headerxr0\headeryh720}{\header \pard\plain \s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033
+\par }
+\footery0{\*\footeryt720\footerxl0\footerxr0\footeryh0}{\footer \pard\plain \s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033
+\par }
+\pgdscnxt1 Convert 1;}}
+{\*\pgdscno1}\paperh15840\paperw12240\margl1800\margr1800\margt720\margb720\sectd\sbknone\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1728\headery720{\header \pard\plain \s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033
+\par }
+\footery720{\footer \pard\plain \s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033
+\par }
+\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \s2\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\aspalpha\rtlch\lang1025\ltrch\dbch\af1\loch\f1 {\ltrch\loch\f1 This is the content of the RTF document}
+\par }
\ No newline at end of file
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml Thu Jan 8 19:28:33 2009
@@ -0,0 +1,12 @@
+<?xml version='1.0' encoding='utf-8'?>
+<address-book>
+ <contact type="individual">
+ <name>Zane Pasolini</name>
+ <address>999 W. Prince St.</address>
+ <city>New York</city>
+ <province>NY</province>
+ <postalcode>10013</postalcode>
+ <country>USA</country>
+ <telephone>+1 212 345 6789</telephone>
+ </contact>
+</address-book>
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml Thu Jan 8 19:28:33 2009
@@ -0,0 +1,21 @@
+<?xml version='1.0' encoding='utf-8'?>
+<address-book>
+ <contact type="individual">
+ <name>Zane Pasolini</name>
+ <address>999 W. Prince St.</address>
+ <city>New York</city>
+ <province>NY</province>
+ <postalcode>10013</postalcode>
+ <country>USA</country>
+ <telephone>+1 212 345 6789</telephone>
+ </contact>
+ <contact type="business">
+ <name>SAMOFIX d.o.o.</name>
+ <address>Ilica 47-2</address>
+ <city>Zagreb</city>
+ <province></province>
+ <postalcode>10000</postalcode>
+ <country>Croatia</country>
+ <telephone>+385 1 123 4567</telephone>
+ </contact>
+</address-book>
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,133 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+import lia.handlingtypes as handlingtypes
+
+from time import time
+from datetime import timedelta
+from lucene import IndexWriter, StandardAnalyzer
+
+from lia.util.ClassLoader import ClassLoader
+
+ #
+ # A File Indexer capable of recursively indexing a directory tree.
+ # Based on lia.meetlucene.Indexer, but handling more than plaintext.
+ #
+
+class FileIndexer(object):
+
+ def main(cls, argv):
+
+ if len(argv) != 3:
+ print "Usage: python FileIndexer.py <index dir> <data dir>"
+ return
+
+ indexDir = argv[1]
+ dataDir = argv[2]
+
+ propsFile = os.path.join(os.path.dirname(handlingtypes.__file__),
+ 'framework', 'handler.properties')
+ input = file(propsFile)
+ props = {}
+ while True:
+ line = input.readline().strip()
+ if not line:
+ break
+ if line.startswith('#'):
+ continue
+ name, value = line.split('=')
+ props[name.strip()] = value.strip()
+ input.close()
+ cls.handlerProps = props
+
+ start = time()
+ numIndexed = cls.index(indexDir, dataDir)
+ duration = timedelta(seconds=time() - start)
+
+ print "Indexing %s files took %s" %(numIndexed, duration)
+
+ def index(cls, indexDir, dataDir):
+
+ if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
+ raise IOError, "%s does not exist or is not a directory" %(dataDir)
+
+ writer = IndexWriter(indexDir, StandardAnalyzer(), True)
+ writer.setUseCompoundFile(False)
+
+ numIndexed = cls.indexDirectory(writer, dataDir)
+ writer.optimize()
+ writer.close()
+
+ return numIndexed
+
+ def indexDirectory(cls, writer, dir):
+
+ count = 0
+ dirs = []
+
+ for name in os.listdir(dir):
+ path = os.path.join(dir, name)
+ if os.path.isfile(path):
+ doc = cls.indexFile(writer, path)
+ if doc is not None:
+ count += 1
+ elif os.path.isdir(path) and not name.startswith('.'):
+ dirs.append(path)
+
+ for dir in dirs:
+ count += cls.indexDirectory(writer, dir)
+
+ return count
+
+ def indexFile(cls, writer, path):
+
+ name, ext = os.path.splitext(path)
+ if ext.startswith(os.path.extsep):
+ ext = ext[len(os.path.extsep):]
+
+ if ext:
+ handlerClassName = cls.handlerProps.get(ext, None)
+ if handlerClassName is None:
+ print "error indexing %s: no handler for %s files" %(path, ext)
+ return None
+
+ try:
+ handlerClass = ClassLoader.loadClass(handlerClassName)
+ handler = handlerClass()
+
+ doc = handler.indexFile(writer, path)
+ if doc is not None:
+ print 'indexed', path
+
+ return doc
+ except SyntaxError:
+ raise
+ except Exception, e:
+ print 'error indexing %s: %s' %(path, e)
+ return None
+
+ main = classmethod(main)
+ index = classmethod(index)
+ indexDirectory = classmethod(indexDirectory)
+ indexFile = classmethod(indexFile)
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# framework package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties Thu Jan 8 19:28:33 2009
@@ -0,0 +1,6 @@
+txt = lia.handlingtypes.text.PlainTextHandler.PlainTextHandler
+html = lia.handlingtypes.html.HTMLHandler.HTMLHandler
+pdf = lia.handlingtypes.pdf.PDFHandler.PDFHandler
+xml = lia.handlingtypes.xml.DigesterXMLHandler.DigesterXMLHandler
+doc = lia.handlingtypes.msdoc.AntiWordHandler.AntiWordHandler
+#rtf = lia.handlingtypes.rtf.JavaBuiltInRTFHandler.JavaBuiltInRTFHandler
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,47 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import HTMLReader, InputStreamReader
+
+
+class HTMLHandler(object):
+
+ def indexFile(self, writer, path):
+
+ try:
+ file = open(path)
+ string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
+ file.close()
+ except:
+ raise
+ else:
+ doc = Document()
+ doc.add(Field("contents", StringReader(string)))
+ doc.add(Field("filename", os.path.abspath(path),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+
+ return doc
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# html package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,51 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import InputStreamReader
+
+
+class AntiWordHandler(object):
+
+ def indexFile(self, writer, path):
+
+ doc = Document()
+
+ try:
+ process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
+ string = InputStreamReader(process.fromchild, 'utf-8').read()
+ except:
+ raise
+ else:
+ doc.add(Field("contents", StringReader(string)))
+ doc.add(Field("filename", os.path.abspath(path),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+
+ exitCode = process.wait()
+ if exitCode != 0:
+ raise RuntimeError, "pdftotext exit code %d" %(exitCode)
+
+ return doc
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# msdoc package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,68 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import InputStreamReader
+
+
+class PDFHandler(object):
+
+ def indexFile(self, writer, path):
+
+ doc = Document()
+
+ try:
+ process = popen2.Popen4(["pdfinfo", "-enc", "UTF-8", path])
+ except:
+ raise
+ else:
+ while True:
+ line = process.fromchild.readline().strip()
+ if not line:
+ break
+ name, value = line.split(':', 1)
+ doc.add(Field(name.strip(), value.strip(),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+
+ exitCode = process.wait()
+ if exitCode != 0:
+ raise RuntimeError, "pdfinfo exit code %d" %(exitCode)
+
+ try:
+ process = popen2.Popen4(["pdftotext", "-enc", "UTF-8", path, "-"])
+ string = InputStreamReader(process.fromchild, 'utf-8').read()
+ except:
+ raise
+ else:
+ doc.add(Field("contents", StringReader(string)))
+ doc.add(Field("filename", os.path.abspath(path),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+
+ exitCode = process.wait()
+ if exitCode != 0:
+ raise RuntimeError, "pdftotext exit code %d" %(exitCode)
+
+ return doc
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# pdf package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,46 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import Document, Field, \
+ InputStreamReader, FileInputStream, JavaError
+
+
+class PlainTextHandler(object):
+
+ def indexFile(self, writer, path):
+
+ try:
+ reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
+ except JavaError:
+ raise
+ else:
+ doc = Document()
+ doc.add(Field("contents", reader))
+ doc.add(Field("filename", os.path.abspath(path),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+ reader.close()
+
+ return doc
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# text package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,76 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import xml.sax
+
+
+class Digester(xml.sax.ContentHandler):
+
+ attributes = {}
+ tags = {}
+
+ def addSetProperty(self, path, property, attribute=None):
+
+ if attribute is not None:
+ pairs = self.attributes.get(path)
+ if pairs is None:
+ self.attributes[path] = pairs = { attribute: property }
+ else:
+ pairs[property] = attribute
+
+ else:
+ self.tags[path] = property
+
+ def parse(self, input):
+
+ xml.sax.parse(input, self)
+ return self.properties
+
+ def startDocument(self):
+
+ self.properties = {}
+ self.path = []
+
+ def startElement(self, tag, attrs):
+
+ self.path.append(tag)
+ pairs = self.attributes.get('/'.join(self.path))
+ if pairs is not None:
+ for name, value in attrs.items():
+ property = pairs.get(name)
+ if property is not None:
+ self.properties[property] = value
+
+ def characters(self, data):
+
+ self.data = data.strip()
+
+ def endElement(self, tag):
+
+ if self.data:
+ property = self.tags.get('/'.join(self.path))
+ if property is not None:
+ self.properties[property] = self.data
+ self.data = None
+
+ self.path.pop()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,75 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import Document, Field
+from lia.handlingtypes.xml.Digester import Digester
+
+
+class DigesterXMLHandler(object):
+    """Indexes an address-book XML file as one Lucene document."""
+
+    # Child elements of address-book/contact whose text is indexed; each
+    # one becomes a property and a document field of the same name.
+    ELEMENTS = ("name", "address", "city", "province", "postalcode",
+                "country", "telephone")
+
+    def __init__(self):
+
+        self.digester = digester = Digester()
+
+        # the contact "type" is read from an attribute, the rest from text
+        digester.addSetProperty("address-book/contact", "type", "type")
+        for element in self.ELEMENTS:
+            digester.addSetProperty("address-book/contact/%s" %(element),
+                                    element)
+
+    def indexFile(self, writer, path):
+        """Parse the XML file at path and add it to writer as a document.
+
+        Returns the Document that was added.  An IOError raised by open()
+        and a KeyError for a property missing from the parsed file both
+        propagate to the caller.
+        """
+
+        stream = open(path)
+        try:
+            props = self.digester.parse(stream)
+        finally:
+            # release the handle even when parsing fails
+            stream.close()
+
+        doc = Document()
+        for name in ("type",) + self.ELEMENTS:
+            doc.add(Field(name, props[name],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+        doc.add(Field("filename", os.path.abspath(path),
+                      Field.Store.YES, Field.Index.UN_TOKENIZED))
+        writer.addDocument(doc)
+
+        return doc
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# xml package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,86 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+
+from lucene import \
+ FSDirectory, System, \
+ Document, Field, SimpleAnalyzer, IndexWriter, IndexReader
+
+
+class BaseIndexingTestCase(TestCase):
+    """Fixture that rebuilds a small index on disk before every test.
+
+    One document is indexed per entry of keywords; the parallel lists
+    below supply the other field values.  Subclasses can override
+    getAnalyzer() and isCompound() to change how the index is written.
+    """
+
+    keywords = ["1", "2"]
+    unindexed = ["Netherlands", "Italy"]
+    unstored = ["Amsterdam has lots of bridges",
+                "Venice has lots of canals"]
+    text = ["Amsterdam", "Venice"]
+
+    def setUp(self):
+
+        # 'index-dir' under the JVM temp directory ('tmp' if unset)
+        tmpDir = System.getProperty('java.io.tmpdir', 'tmp')
+        self.dir = FSDirectory.getDirectory(os.path.join(tmpDir, 'index-dir'),
+                                            True)
+        self.addDocuments(self.dir)
+
+    def addDocuments(self, dir):
+        """Write one four-field document per keyword into dir."""
+
+        writer = IndexWriter(dir, self.getAnalyzer(), True)
+        writer.setUseCompoundFile(self.isCompound())
+
+        for i, keyword in enumerate(self.keywords):
+            doc = Document()
+            doc.add(Field("id", keyword,
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("country", self.unindexed[i],
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("contents", self.unstored[i],
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("city", self.text[i],
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def getAnalyzer(self):
+        """Return the analyzer handed to every IndexWriter."""
+
+        return SimpleAnalyzer()
+
+    def isCompound(self):
+        """Return the flag passed to IndexWriter.setUseCompoundFile()."""
+
+        return True
+
+    def testIndexWriter(self):
+
+        expected = len(self.keywords)
+
+        # second argument False: the index built by setUp() is reused
+        writer = IndexWriter(self.dir, self.getAnalyzer(), False)
+        self.assertEqual(expected, writer.docCount())
+        writer.close()
+
+    def testIndexReader(self):
+
+        expected = len(self.keywords)
+
+        reader = IndexReader.open(self.dir)
+        self.assertEqual(expected, reader.maxDoc())
+        self.assertEqual(expected, reader.numDocs())
+        reader.close()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,106 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+ IndexWriter, SimpleAnalyzer, FSDirectory, System, Document, Field
+
+
+class CompoundVersusMultiFileIndexTest(TestCase):
+    """Times indexing the same documents with and without compound files."""
+
+    def __init__(self, *args):
+
+        super(CompoundVersusMultiFileIndexTest, self).__init__(*args)
+        self.docs = self.loadDocuments(5000, 10)
+
+    def setUp(self):
+
+        indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+                                "index-dir")
+
+        cIndexDir = "%s-compound" %(indexDir)
+        mIndexDir = "%s-multi" %(indexDir)
+
+        # empty out whatever an earlier run left behind
+        self.rmdir(cIndexDir)
+        self.rmdir(mIndexDir)
+
+        self.cDir = FSDirectory.getDirectory(cIndexDir, True)
+        self.mDir = FSDirectory.getDirectory(mIndexDir, True)
+
+    def rmdir(self, dir):
+        """Delete everything below dir; dir itself is left in place.
+
+        A dir that does not exist is ignored, since os.walk() then yields
+        nothing.
+        """
+
+        # Walk bottom-up so that every subdirectory has already been
+        # emptied by the time os.rmdir() is called on it; a top-down walk
+        # fails on nested directories that still hold files.
+        for parent, dirnames, filenames in os.walk(dir, topdown=False):
+            for filename in filenames:
+                os.remove(os.path.join(parent, filename))
+            for dirname in dirnames:
+                os.rmdir(os.path.join(parent, dirname))
+
+    def testTiming(self):
+
+        cTiming = self.timeIndexWriter(self.cDir, True)
+        mTiming = self.timeIndexWriter(self.mDir, False)
+
+        print "Compound Time :", cTiming
+        print "Multi-file Time:", mTiming
+
+        # NOTE(review): this compares wall-clock timings, so the outcome
+        # depends on the machine and its load.
+        self.assert_(cTiming > mTiming)
+
+    def timeIndexWriter(self, dir, isCompound):
+        """Return how long addDocuments() took, as a timedelta."""
+
+        start = time()
+        self.addDocuments(dir, isCompound)
+
+        return timedelta(seconds=time() - start)
+
+    def addDocuments(self, dir, isCompound):
+        """Index every entry of self.docs into dir as a four-field document."""
+
+        writer = IndexWriter(dir, SimpleAnalyzer(), True)
+        writer.setUseCompoundFile(isCompound)
+
+        # change to adjust performance of indexing with FSDirectory
+        # writer.mergeFactor = writer.mergeFactor
+        # writer.maxMergeDocs = writer.maxMergeDocs
+        # writer.minMergeDocs = writer.minMergeDocs
+
+        for word in self.docs:
+            doc = Document()
+            doc.add(Field("keyword", word,
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("unindexed", word,
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("unstored", word,
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("text", word,
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def loadDocuments(self, numDocs, wordsPerDoc):
+        """Return numDocs identical strings of wordsPerDoc repeated words."""
+
+        return ["Bibamus " * wordsPerDoc] * numDocs
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,70 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+from lucene import IndexWriter, IndexReader
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+
+
+class DocumentDeleteTest(BaseIndexingTestCase):
+    """Checks the counts an IndexReader reports around a deletion."""
+
+    def _assertCounts(self, reader, maxDoc, numDocs):
+        # shared check: maxDoc() first, then numDocs()
+        self.assertEqual(maxDoc, reader.maxDoc())
+        self.assertEqual(numDocs, reader.numDocs())
+
+    def testDeleteBeforeIndexMerge(self):
+        """Without optimize(), a deletion lowers numDocs() only."""
+
+        deleter = IndexReader.open(self.dir)
+        self._assertCounts(deleter, 2, 2)
+        deleter.deleteDocument(1)
+
+        # the reader that deleted the document sees the deletion ...
+        self.assert_(deleter.isDeleted(1))
+        self.assert_(deleter.hasDeletions())
+        self._assertCounts(deleter, 2, 1)
+        deleter.close()
+
+        # ... and so does a reader opened afterwards
+        reopened = IndexReader.open(self.dir)
+        self._assertCounts(reopened, 2, 1)
+        reopened.close()
+
+    def testDeleteAfterIndexMerge(self):
+        """After optimize(), maxDoc() drops and no deletions remain."""
+
+        deleter = IndexReader.open(self.dir)
+        self._assertCounts(deleter, 2, 2)
+        deleter.deleteDocument(1)
+        deleter.close()
+
+        # reopen the existing index and optimize it
+        merger = IndexWriter(self.dir, self.getAnalyzer(), False)
+        merger.optimize()
+        merger.close()
+
+        reopened = IndexReader.open(self.dir)
+        self.assert_(not reopened.isDeleted(1))
+        self.assert_(not reopened.hasDeletions())
+        self._assertCounts(reopened, 1, 1)
+        reopened.close()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,72 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+from lucene import \
+ IndexWriter, IndexReader, IndexSearcher, \
+ WhitespaceAnalyzer, Document, Field, Term, TermQuery
+
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+
+
+class DocumentUpdateTest(BaseIndexingTestCase):
+    """Replaces a document by deleting it and adding a new one."""
+
+    def testUpdate(self):
+
+        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
+
+        # step 1: delete every document whose city term is "Amsterdam"
+        remover = IndexReader.open(self.dir)
+        remover.deleteDocuments(Term("city", "Amsterdam"))
+        remover.close()
+
+        # step 2: add the replacement document to the existing index
+        replacement = Document()
+        replacement.add(Field("id", "1",
+                              Field.Store.YES, Field.Index.UN_TOKENIZED))
+        replacement.add(Field("country", "Russia",
+                              Field.Store.YES, Field.Index.NO))
+        replacement.add(Field("contents",
+                              "St. Petersburg has lots of bridges",
+                              Field.Store.NO, Field.Index.TOKENIZED))
+        replacement.add(Field("city", "St. Petersburg",
+                              Field.Store.YES, Field.Index.TOKENIZED))
+
+        writer = IndexWriter(self.dir, self.getAnalyzer(), False)
+        writer.addDocument(replacement)
+        writer.optimize()
+        writer.close()
+
+        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
+        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
+
+
+    def getAnalyzer(self):
+        # NOTE(review): presumably chosen so that terms keep their case and
+        # the capitalised search strings above can match -- confirm
+        return WhitespaceAnalyzer()
+
+
+    def getHitCount(self, fieldName, searchString):
+        """Return the number of hits of a TermQuery on fieldName."""
+
+        searcher = IndexSearcher(self.dir)
+        query = TermQuery(Term(fieldName, searchString))
+        hitCount = searcher.search(query).length()
+        searcher.close()
+
+        return hitCount
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,44 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+from lucene import IndexReader, RAMDirectory
+
+
+class FS2RAMDirectoryTest(BaseIndexingTestCase):
+    """Copies the on-disk index into a RAMDirectory and compares counts."""
+
+    def testSlurp(self):
+
+        fsDirReader = IndexReader.open(self.dir)
+        self.assertEqual(len(self.keywords), fsDirReader.maxDoc())
+        self.assertEqual(len(self.keywords), fsDirReader.numDocs())
+
+        # RAMDirectory built from the directory set up by the base class
+        ramDir = RAMDirectory(self.dir)
+        ramDirReader = IndexReader.open(ramDir)
+        self.assertEqual(fsDirReader.maxDoc(), ramDirReader.maxDoc())
+        self.assertEqual(fsDirReader.numDocs(), ramDirReader.numDocs())
+
+        # close both readers before the directory one of them reads from
+        fsDirReader.close()
+        ramDirReader.close()
+        ramDir.close()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,94 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+ IndexWriter, SimpleAnalyzer, Document, Field, System, \
+ FSDirectory, RAMDirectory
+
+
+class FSversusRAMDirectoryTest(TestCase):
+    """Prints how long the same indexing job takes in RAM and on disk."""
+
+    def __init__(self, *args):
+
+        super(FSversusRAMDirectoryTest, self).__init__(*args)
+        self.docs = self.loadDocuments(3000, 5)
+
+    def setUp(self):
+
+        tmpDir = System.getProperty("java.io.tmpdir", "tmp")
+        self.ramDir = RAMDirectory()
+        self.fsDir = FSDirectory.getDirectory(os.path.join(tmpDir, "fs-index"),
+                                              True)
+
+    def testTiming(self):
+
+        ramTiming = self.timeIndexWriter(self.ramDir)
+        fsTiming = self.timeIndexWriter(self.fsDir)
+
+        # the comparison is left disabled; the timings are only reported
+        #self.assert_(fsTiming > ramTiming)
+
+        print "RAMDirectory Time:", ramTiming
+        print "FSDirectory Time :", fsTiming
+
+    def timeIndexWriter(self, dir):
+        """Return the wall-clock duration of addDocuments(dir)."""
+
+        started = time()
+        self.addDocuments(dir)
+        elapsed = time() - started
+
+        return timedelta(seconds=elapsed)
+
+    def addDocuments(self, dir):
+        """Index every entry of self.docs into dir as a four-field document."""
+
+        writer = IndexWriter(dir, SimpleAnalyzer(), True)
+
+        #
+        # change to adjust performance of indexing with FSDirectory
+        # writer.mergeFactor = writer.mergeFactor
+        # writer.maxMergeDocs = writer.maxMergeDocs
+        # writer.minMergeDocs = writer.minMergeDocs
+        #
+
+        # (field name, store option, index option); every field of a
+        # document carries the same text
+        layout = (("keyword", Field.Store.YES, Field.Index.UN_TOKENIZED),
+                  ("unindexed", Field.Store.YES, Field.Index.NO),
+                  ("unstored", Field.Store.NO, Field.Index.TOKENIZED),
+                  ("text", Field.Store.YES, Field.Index.TOKENIZED))
+
+        for word in self.docs:
+            doc = Document()
+            for name, store, index in layout:
+                doc.add(Field(name, word, store, index))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def loadDocuments(self, numDocs, wordsPerDoc):
+        """Return numDocs identical strings of wordsPerDoc repeated words."""
+
+        content = "Bibamus " * wordsPerDoc
+        return [content] * numDocs
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,86 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+ IndexWriter, SimpleAnalyzer, Document, Field, System, \
+ Term, TermQuery, IndexSearcher, FSDirectory
+
+
+class FieldLengthTest(TestCase):
+    """Shows the effect of IndexWriter.setMaxFieldLength() on searching."""
+
+    keywords = ["1", "2"]
+    unindexed = ["Netherlands", "Italy"]
+    unstored = ["Amsterdam has lots of bridges",
+                "Venice has lots of canals"]
+    text = ["Amsterdam", "Venice"]
+
+    def setUp(self):
+
+        tmpDir = System.getProperty("java.io.tmpdir", "tmp")
+        self.dir = FSDirectory.getDirectory(os.path.join(tmpDir, "index-dir"),
+                                            True)
+
+    def testFieldSize(self):
+
+        # with a limit of 10 the word "bridges" is still found
+        self.addDocuments(self.dir, 10)
+        self.assertEqual(1, self.getHitCount("contents", "bridges"))
+
+        # rebuilt with a limit of 1 it no longer is; presumably only the
+        # first term of each field gets indexed -- confirm
+        self.addDocuments(self.dir, 1)
+        self.assertEqual(0, self.getHitCount("contents", "bridges"))
+
+    def getHitCount(self, fieldName, searchString):
+        """Return the number of hits of a TermQuery on fieldName."""
+
+        searcher = IndexSearcher(self.dir)
+        query = TermQuery(Term(fieldName, searchString))
+        hitCount = searcher.search(query).length()
+        searcher.close()
+
+        return hitCount
+
+    def addDocuments(self, dir, maxFieldLength):
+        """Write one document per keyword with the given field length limit."""
+
+        writer = IndexWriter(dir, SimpleAnalyzer(), True)
+        writer.setMaxFieldLength(maxFieldLength)
+
+        for i, keyword in enumerate(self.keywords):
+            doc = Document()
+            doc.add(Field("id", keyword,
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("country", self.unindexed[i],
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("contents", self.unstored[i],
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("city", self.text[i],
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,70 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+
+from lucene import \
+ IndexWriter, SimpleAnalyzer, Document, Field, Term, FSDirectory, System
+
+
+class IndexTuningDemo(object):
+
+ def main(cls, argv):
+
+ if len(argv) < 5:
+ print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
+ return
+
+ docsInIndex = int(argv[1])
+
+ # create an index called 'index-dir' in a temp directory
+ indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
+ 'index-dir')
+ dir = FSDirectory.getDirectory(indexDir, True)
+ analyzer = SimpleAnalyzer()
+ writer = IndexWriter(dir, analyzer, True)
+
+ # set variables that affect speed of indexing
+ writer.setMergeFactor(int(argv[2]))
+ writer.setMaxMergeDocs(int(argv[3]))
+ writer.setMaxBufferedDocs(int(argv[4]))
+ # writer.infoStream = System.out
+
+ print "Merge factor: ", writer.getMergeFactor()
+ print "Max merge docs:", writer.getMaxMergeDocs()
+ print "Max buffered docs:", writer.getMaxBufferedDocs()
+
+ start = time()
+ for i in xrange(docsInIndex):
+ doc = Document()
+ doc.add(Field("fieldname", "Bibamus",
+ Field.Store.YES, Field.Index.TOKENIZED))
+ writer.addDocument(doc)
+
+ writer.close()
+ print "Time: ", timedelta(seconds=time() - start)
+
+ main = classmethod(main)
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,73 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+
+from lucene import VERSION, \
+ IndexWriter, IndexReader, SimpleAnalyzer, FSDirectory, System
+
+
+class LockTest(TestCase):
+
+ def setUp(self):
+
+ indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+ "index")
+ self.dir = FSDirectory.getDirectory(indexDir, True)
+
+ def testWriteLock(self):
+
+ if VERSION < '2.1.0':
+ writer1 = None
+ writer2 = None
+ gotException = False
+
+ try:
+ try:
+ writer1 = IndexWriter(self.dir, SimpleAnalyzer(), True)
+ writer2 = IndexWriter(self.dir, SimpleAnalyzer(), True)
+
+ self.fail("We should never reach this point")
+ except:
+ gotException = True
+ finally:
+ writer1.close()
+ self.assert_(writer2 is None)
+ self.assert_(gotException)
+
+ def testCommitLock(self):
+
+ reader1 = None
+ reader2 = None
+
+ try:
+ writer = IndexWriter(self.dir, SimpleAnalyzer(), True)
+ writer.close()
+
+ reader1 = IndexReader.open(self.dir)
+ reader2 = IndexReader.open(self.dir)
+ finally:
+ reader1.close()
+ reader2.close()
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,60 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import \
+ FSDirectory, Document, Field, IndexWriter, SimpleAnalyzer, System
+
+
+class VerboseIndexing(object):
+
+ def main(cls, argv):
+
+ vi = VerboseIndexing()
+ vi.index()
+
+ def index(self):
+
+ dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+ "verbose-index")
+ dir = FSDirectory.getDirectory(dirPath, True)
+ writer = IndexWriter(dir, SimpleAnalyzer(), True)
+
+ writer.setInfoStream(System.out)
+
+ for i in xrange(100):
+ doc = Document()
+ doc.add(Field("keyword", "goober",
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+
+ writer.optimize()
+ writer.close()
+
+ main = classmethod(main)
+
+
+if __name__ == "__main__":
+ import sys
+ VerboseIndexing.main(sys.argv)
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# indexing package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,99 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+from lucene import \
+ IndexWriter, StandardAnalyzer, Document, Field, \
+ InputStreamReader, FileInputStream
+
+
+class Indexer(object):
+
+ def main(cls, argv):
+
+ if len(argv) != 3:
+ print "Usage: python Indexer.py <index dir> <data dir>"
+
+ else:
+ indexDir = argv[1]
+ dataDir = argv[2]
+
+ start = time()
+ numIndexed = cls.index(indexDir, dataDir)
+ duration = timedelta(seconds=time() - start)
+
+ print "Indexing %s files took %s" %(numIndexed, duration)
+
+ def index(cls, indexDir, dataDir):
+
+ if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
+ raise IOError, "%s does not exist or is not a directory" %(dataDir)
+
+ writer = IndexWriter(indexDir, StandardAnalyzer(), True)
+ writer.setUseCompoundFile(False)
+
+ cls.indexDirectory(writer, dataDir)
+
+ numIndexed = writer.docCount()
+ writer.optimize()
+ writer.close()
+
+ return numIndexed
+
+ def indexDirectory(cls, writer, dir):
+
+ for name in os.listdir(dir):
+ path = os.path.join(dir, name)
+ if os.path.isfile(path):
+ if path.endswith('.txt'):
+ cls.indexFile(writer, path)
+ elif os.path.isdir(path):
+ cls.indexDirectory(writer, path)
+
+ def indexFile(cls, writer, path):
+
+ try:
+ reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
+ except IOError, e:
+ print 'IOError while opening %s: %s' %(path, e)
+ else:
+ print 'Indexing', path
+ doc = Document()
+ doc.add(Field("contents", reader))
+ doc.add(Field("path", os.path.abspath(path),
+ Field.Store.YES, Field.Index.UN_TOKENIZED))
+ writer.addDocument(doc)
+ reader.close()
+
+ main = classmethod(main)
+ index = classmethod(index)
+ indexDirectory = classmethod(indexDirectory)
+ indexFile = classmethod(indexFile)
+
+
+if __name__ == "__main__":
+ import sys
+ Indexer.main(sys.argv)
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1,71 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+
+from lucene import \
+ Document, IndexSearcher, FSDirectory, QueryParser, StandardAnalyzer, Hit
+
+
+class Searcher(object):
+
+ def main(cls, argv):
+
+ if len(argv) != 3:
+ print "Usage: python Searcher.py <index dir> <query>"
+
+ else:
+ indexDir = argv[1]
+ q = argv[2]
+
+ if not (os.path.exists(indexDir) and os.path.isdir(indexDir)):
+ raise IOError, "%s does not exist or is not a directory" %(indexDir)
+
+ cls.search(indexDir, q)
+
+ def search(cls, indexDir, q):
+
+ fsDir = FSDirectory.getDirectory(indexDir, False)
+ searcher = IndexSearcher(fsDir)
+
+ query = QueryParser("contents", StandardAnalyzer()).parse(q)
+ start = time()
+ hits = searcher.search(query)
+ duration = timedelta(seconds=time() - start)
+
+ print "Found %d document(s) (in %s) that matched query '%s':" %(hits.length(), duration, q)
+
+ for hit in hits:
+ doc = Hit.cast_(hit).getDocument()
+ print doc["path"]
+
+ main = classmethod(main)
+ search = classmethod(search)
+
+
+if __name__ == "__main__":
+ import sys
+ Searcher.main(sys.argv)
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py Thu Jan 8 19:28:33 2009
@@ -0,0 +1 @@
+# meetlucene package
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
------------------------------------------------------------------------------
svn:mime-type = text/plain