You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@lucenenet.apache.org by sh...@ineos.com on 2012/05/14 16:21:55 UTC
Question on basic functionality
I've been trying various things to try to make the indexing faster. I've
not been able to do successful searches when I don't do an optimize and
commit after adding each document. It does return a value, but not all of
the values I'm expecting. I've tried moving the commit to the end, which
makes it a ton faster but I expect to return 4 entries and I only get one.
I'm suspecting it has to do with the index being split into segments and
they're not merged at the end. It's only indexing 2k records but it's
taking around an hour on my dual core laptop. I have tried using ramdisk
already to get rid of the i/o bottleneck if there is one, but it gave
about the same result. Any help would be appreciated.
Here's the basic code I'm using to add records:
' Indexing excerpt: creates an IndexWriter over c:\test and adds one document per pass of the While loop.
' dbrs, intC, strFile, strBody and getNextFile are used here but are declared outside this excerpt.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim indx As New Index.IndexWriter(dir, anlz, True,
IndexWriter.MaxFieldLength.UNLIMITED)
Dim mergePolicy As MergePolicy
Dim logPolicy As LogMergePolicy
indx.SetUseCompoundFile(True)
mergePolicy = indx.GetMergePolicy
' NOTE(review): a MergePolicy reference is assigned to a LogMergePolicy variable; presumably this
' relies on the writer's policy actually being a LogMergePolicy - confirm.
logPolicy = mergePolicy
logPolicy.SetNoCFSRatio(1)
indx.SetRAMBufferSizeMB(256)
intC = 0
' One Document instance is reused for every row: its three fields are added on the first pass
' (intC = 1) and overwritten with SetValue on every later pass.
Dim doc As New Documents.Document
While dbrs.EOF = False
intC = intC + 1
' NOTE(review): presumably getNextFile fills strFile/strBody and advances dbrs; it is not shown here,
' and nothing else in this loop moves dbrs forward - verify.
getNextFile
If intC = 1 Then
doc.Add(New Documents.Field("id", intC,
Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("path", strFile,
Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("body", strBody,
Documents.Field.Store.YES, Documents.Field.Index.ANALYZED))
Else
doc.GetField("id").SetValue(intC)
doc.GetField("path").SetValue(strFile)
doc.GetField("body").SetValue(strBody)
End If
indx.AddDocument(doc)
' Optimize and Commit sit inside the loop, so both run once for every document added.
indx.Optimize(1)
indx.Commit()
End While
Here's the code I'm using to search:
' Search excerpt: parses the quoted phrase "I-A-05.50" against the "body" field of the index in
' c:\test and prints the stored "id" and "path" of every hit.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim IR As IndexReader = IndexReader.Open(dir, True)
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim parser As New
QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "body", anlz)
Dim query As Search.Query
Dim searcher As IndexSearcher
Dim resultDocs As TopDocs
Dim hits() As ScoreDoc
Dim hit As ScoreDoc
Dim score As Double
Dim i As Integer
query = parser.Parse("""I-A-05.50""")
searcher = New IndexSearcher(IR)
' The hit limit passed to Search is IR.MaxDoc().
resultDocs = searcher.Search(query, IR.MaxDoc())
Console.WriteLine("Found " & resultDocs.TotalHits & "
results")
Dim doc As Documents.Document
hits = resultDocs.ScoreDocs
For Each hit In hits
doc = searcher.Doc(hit.Doc)
score = hit.Score
' NOTE(review): i is never assigned anywhere in this excerpt, so i + 1 has the same value for every hit.
Console.WriteLine("Results num: " & i + 1 & "
score: " & score)
Console.WriteLine("ID: " & doc.Get("id"))
Console.WriteLine("Path: " & doc.Get("path"))
Next
' Only searcher and dir are closed here; IR is not closed in this excerpt.
searcher.Close()
dir.Close()
RE: Question on basic functionality
Posted by sh...@ineos.com.
It turns out my problem was self-inflicted: the data had changed. I wasn't finding
what I was looking for because my document manager had changed things and
removed files.
I ended up generating an example program to demonstrate the problem I
thought I had, so I'm going to post it for posterity, perhaps it will help
someone.
/create creates an access db with 2000 documents made up of random words
with 4 of the search criteria injected randomly.
/index creates the index from the access database
/find returns the results.
Imports System.IO
Imports Lucene.Net
Imports Lucene.Net.Analysis.Standard
Imports Lucene.Net.Search
Imports Lucene.Net.Index
Module Module1
' Names and sizes shared by the /create, /index and /find commands.
Private Const DatabaseFile As String = "alltext.mdb"
Private Const IndexFolder As String = "test"
Private Const CatalogString As String = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" & DatabaseFile
Private Const ConnectionString As String = CatalogString & ";User Id=admin;Password=;"
Private Const DocumentCount As Integer = 2000
Private Const InjectedCount As Integer = 4
Private Const SearchPhrase As String = "I-A-05.50"
Private Const Divider As String = "----------------------------------------------------"

''' <summary>
''' Entry point. Dispatches on the command-line switch:
''' /create builds the sample database, /index builds the Lucene index from it,
''' /find searches the index, anything else prints the usage text.
''' </summary>
Sub Main()
    Select Case LCase(Trim(Command()))
        Case "/create"
            CreateDatabase()
        Case "/index"
            BuildIndex()
        Case "/find"
            FindDocuments()
        Case Else
            PrintUsage()
    End Select
End Sub

' Rewinds the console cursor with ten backspaces and prints done/total as a whole percentage.
Private Sub WriteProgress(done As Integer, total As Integer)
    If total <= 0 Then
        Return
    End If
    Console.Write(New String(Chr(8), 10))
    Console.Write(CStr(Int((done / total) * 100)) & "%")
End Sub

' /create: builds alltext.mdb with DocumentCount generated documents and injects
' SearchPhrase into InjectedCount of them.
Private Sub CreateDatabase()
    Dim rng As New Random

    If Directory.Exists(IndexFolder) = False Then
        Directory.CreateDirectory(IndexFolder)
    End If

    ' Always start from an empty database: re-running against an existing file would fail on
    ' CREATE TABLE, and a fresh COUNTER column is what makes the ids run from 1 to DocumentCount.
    If File.Exists(DatabaseFile) Then
        File.Delete(DatabaseFile)
    End If
    Dim cat As Object = CreateObject("ADOX.Catalog")
    cat.Create(CatalogString)
    cat = Nothing

    Dim dbc As New ADODB.Connection
    dbc.ConnectionString = ConnectionString
    dbc.Open()
    Try
        dbc.Execute("CREATE TABLE docs (ID COUNTER PRIMARY KEY,Location Text(50),BODY memo)")

        ' The SQL is built by concatenation; that is only acceptable because genword/gendoc emit
        ' nothing but ASCII letters and spaces, and the injected phrase contains no quote characters.
        For docNumber As Integer = 1 To DocumentCount
            dbc.Execute("INSERT into docs(location,Body) values ('c:\" & genword(docNumber) & "','" & gendoc(docNumber, False, "") & "')")
            WriteProgress(docNumber, DocumentCount)
        Next

        ' Pick distinct ids over the whole 1..DocumentCount range. Random.Next excludes its upper
        ' bound, and repeated ids would leave fewer than InjectedCount documents with the phrase.
        Dim targetIds As New System.Collections.Generic.List(Of Integer)
        While targetIds.Count < InjectedCount
            Dim candidate As Integer = rng.Next(1, DocumentCount + 1)
            If Not targetIds.Contains(candidate) Then
                targetIds.Add(candidate)
            End If
        End While

        For seed As Integer = 1 To InjectedCount
            dbc.Execute("update docs set body ='" & gendoc(seed, True, SearchPhrase) & "' where id=" & targetIds(seed - 1))
        Next
        Console.WriteLine(" done")
    Finally
        dbc.Close()
    End Try
End Sub

' /index: rebuilds the Lucene index in IndexFolder from every row of the docs table,
' optimizing and committing once after all documents have been added.
Private Sub BuildIndex()
    If File.Exists(DatabaseFile) = False Then
        Console.WriteLine(Divider)
        Console.WriteLine("alltext.mdb doesn't exist run /create first")
        Console.WriteLine(Divider)
        Return
    End If

    Dim indexDir As New Store.SimpleFSDirectory(New DirectoryInfo(IndexFolder))
    Dim anlz As New StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
    Dim indx As New Index.IndexWriter(indexDir, anlz, True, IndexWriter.MaxFieldLength.UNLIMITED)
    Dim dbc As New ADODB.Connection
    Try
        indx.SetUseCompoundFile(True)
        indx.SetRAMBufferSizeMB(256)
        Console.WriteLine("Started:" & Now())

        dbc.ConnectionString = ConnectionString
        dbc.Open()
        Dim dbrsC As ADODB.Recordset = dbc.Execute("select count(*) from docs")
        Dim total As Integer = dbrsC.Fields(0).Value
        dbrsC.Close()
        Dim dbrs As ADODB.Recordset = dbc.Execute("select * from docs")
        Console.WriteLine("Docs:" & CStr(total))

        ' One Document and its three Fields are reused for every row; only the values change.
        Dim nameField As New Documents.Field("name", "", Documents.Field.Store.YES, Documents.Field.Index.NO)
        Dim pathField As New Documents.Field("path", "", Documents.Field.Store.YES, Documents.Field.Index.NO)
        Dim bodyField As New Documents.Field("body", "", Documents.Field.Store.YES, Documents.Field.Index.ANALYZED)
        Dim doc As New Documents.Document
        doc.Add(nameField)
        doc.Add(pathField)
        doc.Add(bodyField)

        Dim done As Integer = 0
        While dbrs.EOF = False
            done += 1
            ' & "" turns the column value (including a database NULL) into a String.
            nameField.SetValue(dbrs.Fields("id").Value & "")
            pathField.SetValue(dbrs.Fields("location").Value & "")
            bodyField.SetValue(dbrs.Fields("body").Value & "")
            indx.AddDocument(doc)
            WriteProgress(done, total)
            dbrs.MoveNext()
        End While
        dbrs.Close()

        indx.Optimize(1)
        indx.Commit()
        Console.WriteLine("")
        Console.WriteLine("Finished:" & Now())
    Finally
        ' Release the writer, the database connection and the directory even when indexing fails.
        indx.Close()
        If dbc.State <> ADODB.ObjectStateEnum.adStateClosed Then
            dbc.Close()
        End If
        indexDir.Close()
    End Try
End Sub

' /find: searches the "body" field for the quoted SearchPhrase and prints the stored
' "name" and "path" of every hit, numbered from 1.
Private Sub FindDocuments()
    Try
        If Directory.Exists(IndexFolder) = False Then
            Console.WriteLine(Divider)
            Console.WriteLine("test folder doesn't exist run /index")
            Console.WriteLine(Divider)
            Return
        End If

        Dim indexDir As New Store.SimpleFSDirectory(New DirectoryInfo(IndexFolder))
        Dim reader As IndexReader = Nothing
        Dim searcher As IndexSearcher = Nothing
        Try
            reader = IndexReader.Open(indexDir, True)
            Dim anlz As New StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
            Dim parser As New QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "body", anlz)
            Dim query As Search.Query = parser.Parse("""" & SearchPhrase & """")
            searcher = New IndexSearcher(reader)

            ' Ask for every document, but never for zero hits: MaxDoc() is 0 on an empty index.
            Dim resultDocs As TopDocs = searcher.Search(query, Math.Max(1, reader.MaxDoc()))
            Console.WriteLine("Found " & resultDocs.TotalHits & " results")

            Dim resultNumber As Integer = 0
            For Each hit As ScoreDoc In resultDocs.ScoreDocs
                resultNumber += 1
                Dim doc As Documents.Document = searcher.Doc(hit.Doc)
                Dim score As Double = hit.Score
                Console.WriteLine("Results num: " & resultNumber & " score: " & score)
                Console.WriteLine("ID: " & doc.Get("name"))
                Console.WriteLine("Path: " & doc.Get("path"))
            Next
        Finally
            If searcher IsNot Nothing Then
                searcher.Close()
            End If
            If reader IsNot Nothing Then
                reader.Close()
            End If
            indexDir.Close()
        End Try
    Catch ex As Exception
        Console.WriteLine("An error occurred on /find:")
        Console.WriteLine(Divider)
        Console.WriteLine(ex.Message)
        Console.WriteLine(Divider)
        Console.WriteLine("You must use /create and /index before this will function")
    End Try
End Sub

' Prints the list of supported command-line switches.
Private Sub PrintUsage()
    Console.WriteLine(Divider)
    Console.WriteLine("/create will create a folder " & Directory.GetCurrentDirectory & "\test and build an alltext.mdb database with contrived search data")
    Console.WriteLine("/index will create a new lucene index from the generated data with optimize and commit after all documents are added")
    Console.WriteLine("/find should return 4 results from the index in test")
    Console.WriteLine(Divider)
End Sub
''' <summary>
''' Builds a pseudo-random word of ASCII letters. The generator is seeded with
''' <paramref name="intSeed"/>, so the same seed always produces the same word.
''' </summary>
''' <param name="intSeed">Seed for the private Random instance.</param>
''' <returns>A word whose length is drawn from Next(5, 10), mixing upper- and lower-case letters.</returns>
Function genword(intSeed As Integer) As String
    Dim generator As New Random(intSeed)
    Dim letters As New System.Text.StringBuilder()
    Dim wordLength As Integer = generator.Next(5, 10)

    For position As Integer = 1 To wordLength
        ' Both candidates are drawn on every pass so the random sequence is consumed in a fixed order.
        Dim upperCode As Integer = generator.Next(65, 90)
        Dim lowerCode As Integer = generator.Next(97, 122)

        ' The upper-case candidate is used only when its character code is a multiple of 5.
        If upperCode Mod 5 = 0 Then
            letters.Append(Chr(upperCode))
        Else
            letters.Append(Chr(lowerCode))
        End If
    Next

    Return letters.ToString()
End Function
''' <summary>
''' Builds a pseudo-random document: a space-separated run of words from genword, with the
''' word count drawn from a Random seeded with <paramref name="intSeed"/>.
''' </summary>
''' <param name="intSeed">Seed for the word count; word number k is genword(intSeed + k).</param>
''' <param name="booInject">When True, <paramref name="strInjected"/> is spliced into the text.</param>
''' <param name="strInjected">Text inserted, followed by one space, right after the first space found at or beyond the midpoint.</param>
''' <returns>The generated text with leading and trailing spaces trimmed.</returns>
Function gendoc(intSeed As Integer, Optional booInject As Boolean = False, Optional strInjected As String = "") As String
    Dim generator As New Random(intSeed)
    Dim builder As New System.Text.StringBuilder()
    Dim wordCount As Integer = generator.Next(800, 1000)

    For offset As Integer = 1 To wordCount
        builder.Append(genword(intSeed + offset)).Append(" ")
    Next

    Dim body As String = builder.ToString()
    If booInject Then
        ' searchFrom is a 1-based position, hence the -1 for IndexOf and the +1 on its result.
        Dim searchFrom As Integer = CInt(body.Length / 2)
        Dim splitAt As Integer = body.IndexOf(" "c, searchFrom - 1) + 1
        body = body.Insert(splitAt, strInjected & " ")
    End If

    Return Trim(body)
End Function
End Module
RE: Question on basic functionality
Posted by sh...@ineos.com.
Yes, what is happening is that my attempts at speeding it up have caused the
search to stop working. So I am assuming I'm doing it wrong.
The merge policies I added in a separate attempt to make it faster after
reading a post elsewhere.
When I try moving the optimize and commit outside the while loop the
search only returns the single value, whereas when it is inside it returns
the 4 expected. I'll try it again though.
I've not changed the search function so I'm thinking it is the indexing
that is the problem. I-A-05.50 is a title that other documents refer to, so I
expect the hits to be the original document and 3 references from other
documents.
Thanks,
Shane
From: "Simon Svensson" <si...@devhost.se>
To: <lu...@lucene.apache.org>
Cc: Shane Bumpurs/US/OLIGOMERS/INEOS@INEOS
Date: 05/14/2012 10:02 AM
Subject: RE: Question on basic functionality
Hi,
You describe two separate problems; indexing speed and search issues.
Have you done any cpu profiling to determine where to begin looking for
your slow indexing speed? It sounds like you've ruled out an i/o bottleneck,
but it could still be a slow database you're reading from. Try simplifying
your code by removing references to merge policies (the default policies
should be enough) and creating new Document/Field instances instead of
reusing them. Also, move that Optimize and Commit call outside your While
loop.
Your search issue is probably due to your use of StandardAnalyzer. It
does not know the secret meaning of "I-A-05.50" (Product number? Secret
identifier?) and will tokenize that into "I" and "05.50". The "A" will be
skipped as it is a default stopword. I have to admit a lack of knowledge
regarding StandardAnalyzer's use of positional information. You're
currently searching for the phrase "I 05.50" or "I [anything] 05.50".
Could you provide some example data which you expect to match, but isn't
returned by your IndexSearcher?
// Simon
-----Original Message-----
From: shane.bumpurs@ineos.com [mailto:shane.bumpurs@ineos.com]
Sent: Monday, May 14, 2012 4:22 PM
To: lucene-net-user@lucene.apache.org
Subject: Question on basic functionality
I've been trying various things to try to make the indexing faster. I've
not been able to do successful searches when I don't do an optimize and
commit after adding each document. It does return a value, but not all of
the values I'm expecting. I've tried moving the commit to the end, which
makes it a ton faster but I expect to return 4 entries and I only get one.
I'm suspecting it has to do with the index being split into segments and
they're not merged at the end. It's only indexing 2k records but it's
taking around an hour on my dual core laptop. I have tried using ramdisk
already to get rid of the i/o bottleneck if there is one, but it gave
about the same result. Any help would be appreciated.
Here's the basic code I'm using to add records:
' Indexing excerpt (quoted from the original question): creates an IndexWriter over c:\test and
' adds one document per pass of the While loop.
' dbrs, intC, strFile, strBody and getNextFile are used here but are declared outside this excerpt.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim indx As New Index.IndexWriter(dir, anlz, True,
IndexWriter.MaxFieldLength.UNLIMITED)
Dim mergePolicy As MergePolicy
Dim logPolicy As LogMergePolicy
indx.SetUseCompoundFile(True)
mergePolicy = indx.GetMergePolicy
' NOTE(review): a MergePolicy reference is assigned to a LogMergePolicy variable; presumably this
' relies on the writer's policy actually being a LogMergePolicy - confirm.
logPolicy = mergePolicy
logPolicy.SetNoCFSRatio(1)
indx.SetRAMBufferSizeMB(256)
intC = 0
' One Document instance is reused for every row: its three fields are added on the first pass
' (intC = 1) and overwritten with SetValue on every later pass.
Dim doc As New Documents.Document
While dbrs.EOF = False
intC = intC + 1
' NOTE(review): presumably getNextFile fills strFile/strBody and advances dbrs; it is not shown here - verify.
getNextFile
If intC = 1 Then
doc.Add(New Documents.Field("id", intC,
Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("path", strFile,
Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("body", strBody,
Documents.Field.Store.YES, Documents.Field.Index.ANALYZED))
Else
doc.GetField("id").SetValue(intC)
doc.GetField("path").SetValue(strFile)
doc.GetField("body").SetValue(strBody)
End If
indx.AddDocument(doc)
' Optimize and Commit sit inside the loop, so both run once for every document added.
indx.Optimize(1)
indx.Commit()
End While
Here's the code I'm using to search:
' Search excerpt (quoted from the original question): parses the quoted phrase "I-A-05.50" against
' the "body" field of the index in c:\test and prints the stored "id" and "path" of every hit.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim IR As IndexReader = IndexReader.Open(dir, True)
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim parser As New
QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "body", anlz)
Dim query As Search.Query
Dim searcher As IndexSearcher
Dim resultDocs As TopDocs
Dim hits() As ScoreDoc
Dim hit As ScoreDoc
Dim score As Double
Dim i As Integer
query = parser.Parse("""I-A-05.50""")
searcher = New IndexSearcher(IR)
' The hit limit passed to Search is IR.MaxDoc().
resultDocs = searcher.Search(query, IR.MaxDoc())
Console.WriteLine("Found " & resultDocs.TotalHits & "
results")
Dim doc As Documents.Document
hits = resultDocs.ScoreDocs
For Each hit In hits
doc = searcher.Doc(hit.Doc)
score = hit.Score
' NOTE(review): i is never assigned anywhere in this excerpt, so i + 1 has the same value for every hit.
Console.WriteLine("Results num: " & i + 1 & "
score: " & score)
Console.WriteLine("ID: " & doc.Get("id"))
Console.WriteLine("Path: " & doc.Get("path"))
Next
' Only searcher and dir are closed here; IR is not closed in this excerpt.
searcher.Close()
dir.Close()
RE: Question on basic functionality
Posted by Simon Svensson <si...@devhost.se>.
Hi,
You describe two separate problems; indexing speed and search issues.
Have you done any cpu profiling to determine where to begin looking for your slow indexing speed? It sounds like you've ruled out an i/o bottleneck, but it could still be a slow database you're reading from. Try simplifying your code by removing references to merge policies (the default policies should be enough) and creating new Document/Field instances instead of reusing them. Also, move that Optimize and Commit call outside your While loop.
Your search issue is probably due to your use of StandardAnalyzer. It does not know the secret meaning of "I-A-05.50" (Product number? Secret identifier?) and will tokenize that into "I" and "05.50". The "A" will be skipped as it is a default stopword. I have to admit a lack of knowledge regarding StandardAnalyzer's use of positional information. You're currently searching for the phrase "I 05.50" or "I [anything] 05.50".
Could you provide some example data which you expect to match, but isn't returned by your IndexSearcher?
// Simon
-----Original Message-----
From: shane.bumpurs@ineos.com [mailto:shane.bumpurs@ineos.com]
Sent: Monday, May 14, 2012 4:22 PM
To: lucene-net-user@lucene.apache.org
Subject: Question on basic functionality
I've been trying various things to try to make the indexing faster. I've not been able to do successful searches when I don't do an optimize and commit after adding each document. It does return a value, but not all of the values I'm expecting. I've tried moving the commit to the end, which makes it a ton faster but I expect to return 4 entries and I only get one.
I'm suspecting it has to do with the index being split into segments and they're not merged at the end. It's only indexing 2k records but it's taking around an hour on my dual core laptop. I have tried using ramdisk already to get rid of the i/o bottleneck if there is one, but it gave about the same result. Any help would be appreciated.
Here's the basic code I'm using to add records:
' Indexing excerpt (quoted from the original question): creates an IndexWriter over c:\test and
' adds one document per pass of the While loop.
' dbrs, intC, strFile, strBody and getNextFile are used here but are declared outside this excerpt.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim indx As New Index.IndexWriter(dir, anlz, True,
IndexWriter.MaxFieldLength.UNLIMITED)
Dim mergePolicy As MergePolicy
Dim logPolicy As LogMergePolicy
indx.SetUseCompoundFile(True)
mergePolicy = indx.GetMergePolicy
' NOTE(review): a MergePolicy reference is assigned to a LogMergePolicy variable; presumably this
' relies on the writer's policy actually being a LogMergePolicy - confirm.
logPolicy = mergePolicy
logPolicy.SetNoCFSRatio(1)
indx.SetRAMBufferSizeMB(256)
intC = 0
' One Document instance is reused for every row: its three fields are added on the first pass
' (intC = 1) and overwritten with SetValue on every later pass.
Dim doc As New Documents.Document
While dbrs.EOF = False
intC = intC + 1
' NOTE(review): presumably getNextFile fills strFile/strBody and advances dbrs; it is not shown here - verify.
getNextFile
If intC = 1 Then
doc.Add(New Documents.Field("id", intC, Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("path", strFile, Documents.Field.Store.YES, Documents.Field.Index.NO))
doc.Add(New Documents.Field("body", strBody, Documents.Field.Store.YES, Documents.Field.Index.ANALYZED))
Else
doc.GetField("id").SetValue(intC)
doc.GetField("path").SetValue(strFile)
doc.GetField("body").SetValue(strBody)
End If
indx.AddDocument(doc)
' Optimize and Commit sit inside the loop, so both run once for every document added.
indx.Optimize(1)
indx.Commit()
End While
Here's the code I'm using to search:
' Search excerpt (quoted from the original question): parses the quoted phrase "I-A-05.50" against
' the "body" field of the index in c:\test and prints the stored "id" and "path" of every hit.
Dim dir As New Store.SimpleFSDirectory(New
DirectoryInfo("c:\test"))
Dim IR As IndexReader = IndexReader.Open(dir, True)
Dim anlz As New
StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29)
Dim parser As New
QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "body", anlz)
Dim query As Search.Query
Dim searcher As IndexSearcher
Dim resultDocs As TopDocs
Dim hits() As ScoreDoc
Dim hit As ScoreDoc
Dim score As Double
Dim i As Integer
query = parser.Parse("""I-A-05.50""")
searcher = New IndexSearcher(IR)
' The hit limit passed to Search is IR.MaxDoc().
resultDocs = searcher.Search(query, IR.MaxDoc())
Console.WriteLine("Found " & resultDocs.TotalHits & "
results")
Dim doc As Documents.Document
hits = resultDocs.ScoreDocs
For Each hit In hits
doc = searcher.Doc(hit.Doc)
score = hit.Score
' NOTE(review): i is never assigned anywhere in this excerpt, so i + 1 has the same value for every hit.
Console.WriteLine("Results num: " & i + 1 & "
score: " & score)
Console.WriteLine("ID: " & doc.Get("id"))
Console.WriteLine("Path: " & doc.Get("path"))
Next
' Only searcher and dir are closed here; IR is not closed in this excerpt.
searcher.Close()
dir.Close()