You are viewing a plain text version of this content. The canonical link for it is here.

Posted to solr-user@lucene.apache.org by Furkan KAMACI <fu...@gmail.com> on 2013/04/26 11:30:12 UTC

Document is missing mandatory uniqueKey field: id for Solr PDF indexing

I use Solr 4.2.1 and these are my fields:

<field name="id" type="string" indexed="true" stored="true" required="true"
multiValued="false" />
<field name="text" type="text_general" indexed="true" stored="true"/>


<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
Some fields are multiValued only because Tika currently may return
multiple values for them. Some metadata is parsed from the documents,
but there are some which come from the client context:
"content_type": From the HTTP headers of incoming stream
"resourcename": From SolrCell request param resource.name
-->
<field name="title" type="text_general" indexed="true" stored="true"
multiValued="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="resourcename" type="text_general" indexed="true"
stored="true"/>
<field name="url" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true"
multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true"
multiValued="true"/>

<!-- Main body of document extracted by SolrCell.
NOTE: This field is not indexed by default, since it is also copied to
"text"
using copyField below. This is to save space. Use this field for returning
and
highlighting document content. Use the "text" field to search the content.
-->
<field name="content" type="text_general" indexed="false" stored="true"
multiValued="true"/>


<!-- catchall field, containing all other searchable text fields
(implemented
via copyField further on in this schema) -->
<!--
<field name="text" type="text_general" indexed="true" stored="false"
multiValued="true"/>
-->
<!-- catchall text field that indexes tokens both normally and in reverse
for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_general_rev" indexed="true"
stored="false" multiValued="true"/>

<!-- non-tokenized version of manufacturer to make it easier to sort or
group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>

<field name="payloads" type="payloads" indexed="true" stored="true"/>

<field name="_version_" type="long" indexed="true" stored="true"/>

I ran this command:

java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
523387.pdf

However, I get the following error. Any ideas?

Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
SEVERE: org.apache.solr.common.SolrException: Document is missing mandatory
uniqueKey field: id
at
org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
at
org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
at org.eclipse.jetty.server.Server.handle(Server.java:365)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
at java.lang.Thread.run(Thread.java:722)

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Furkan KAMACI <fu...@gmail.com>.

OK, I asked another question about it and it has been solved, thanks.

2013/4/26 Furkan KAMACI <fu...@gmail.com>

> Jack, thanks for your answers. Ok, when I remove -Durl parameter I think
> it works, thanks. However I think that I have a problem with my schema. I
> get that error:
>
> Apr 26, 2013 3:52:21 PM org.apache.solr.common.SolrException log
> SEVERE: org.apache.solr.common.SolrException: ERROR:
> [doc=/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/exampledocs/523387.pdf]
> multiple values encountered for non multiValued copy field text:
> application/pdf
>
>
> 2013/4/26 Jack Krupansky <ja...@basetechnology.com>
>
>> Maybe you are confusing things by mixing instructions - there are
>> SEPARATE instructions for directly using SolrCell and implicitly using it
>> via post.jar. Pick which you want and stick with it. DO NOT MIX the
>> instructions.
>>
>> You wrote: " I run that command: java -Durl=
>> http://localhost:8983/solr/update/extract -jar post.jar 523387.pdf"
>>
>> Was there a GOOD reason that you chose that URL?
>>
>> Best to stay with what the post.jar wiki recommends:
>>
>> Post all CSV, XML, JSON and PDF documents using AUTO mode which detects
>> type based on file name:
>>
>> java -Dauto -jar post.jar *.csv *.xml *.json *.pdf
>>
>> Or, stick with SolrCell directly, but follow its distinct instructions:
>> http://wiki.apache.org/solr/ExtractingRequestHandler
>>
>> Again, DO NOT MIX the instructions from the two.
>>
>> post.jar is designed so that you do not need to know or care exactly how
>> rich document indexing works.
>>
>> -- Jack Krupansky
>>
>> -----Original Message----- From: Furkan KAMACI
>> Sent: Friday, April 26, 2013 5:30 AM
>> To: solr-user@lucene.apache.org
>> Subject: Document is missing mandatory uniqueKey field: id for Solr PDF
>> indexing
>>
>>
>> I use Solr 4.2.1 and these are my fields:
>>
>> <field name="id" type="string" indexed="true" stored="true"
>> required="true"
>> multiValued="false" />
>> <field name="text" type="text_general" indexed="true" stored="true"/>
>>
>>
>> <!-- Common metadata fields, named specifically to match up with
>> SolrCell metadata when parsing rich documents such as Word, PDF.
>> Some fields are multiValued only because Tika currently may return
>> multiple values for them. Some metadata is parsed from the documents,
>> but there are some which come from the client context:
>> "content_type": From the HTTP headers of incoming stream
>> "resourcename": From SolrCell request param resource.name
>> -->
>> <field name="title" type="text_general" indexed="true" stored="true"
>> multiValued="true"/>
>> <field name="subject" type="text_general" indexed="true" stored="true"/>
>> <field name="description" type="text_general" indexed="true"
>> stored="true"/>
>> <field name="comments" type="text_general" indexed="true" stored="true"/>
>> <field name="author" type="text_general" indexed="true" stored="true"/>
>> <field name="keywords" type="text_general" indexed="true" stored="true"/>
>> <field name="category" type="text_general" indexed="true" stored="true"/>
>> <field name="resourcename" type="text_general" indexed="true"
>> stored="true"/>
>> <field name="url" type="text_general" indexed="true" stored="true"/>
>> <field name="content_type" type="string" indexed="true" stored="true"
>> multiValued="true"/>
>> <field name="last_modified" type="date" indexed="true" stored="true"/>
>> <field name="links" type="string" indexed="true" stored="true"
>> multiValued="true"/>
>>
>> <!-- Main body of document extracted by SolrCell.
>> NOTE: This field is not indexed by default, since it is also copied to
>> "text"
>> using copyField below. This is to save space. Use this field for returning
>> and
>> highlighting document content. Use the "text" field to search the content.
>> -->
>> <field name="content" type="text_general" indexed="false" stored="true"
>> multiValued="true"/>
>>
>>
>> <!-- catchall field, containing all other searchable text fields
>> (implemented
>> via copyField further on in this schema -->
>> <!--
>> <field name="text" type="text_general" indexed="true" stored="false"
>> multiValued="true"/>
>> -->
>> <!-- catchall text field that indexes tokens both normally and in reverse
>> for efficient
>> leading wildcard queries. -->
>> <field name="text_rev" type="text_general_rev" indexed="true"
>> stored="false" multiValued="true"/>
>>
>> <!-- non-tokenized version of manufacturer to make it easier to sort or
>> group
>> results by manufacturer. copied from "manu" via copyField -->
>> <field name="manu_exact" type="string" indexed="true" stored="false"/>
>>
>> <field name="payloads" type="payloads" indexed="true" stored="true"/>
>>
>> <field name="_version_" type="long" indexed="true" stored="true"/>
>>
>> I run that command:
>>
>> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
>> 523387.pdf
>>
>> However I get that error, any ideas?
>>
>> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
>> SEVERE: org.apache.solr.common.SolrException: Document is missing
>> mandatory
>> uniqueKey field: id
>> at
>>
>> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
>> at
>>
>> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
>> at
>>
>> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
>> at
>>
>> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
>> at
>>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
>> at
>>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
>> at
>>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
>> at
>>
>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>> at
>>
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
>> at
>>
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
>> at
>>
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
>> at
>>
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
>> at
>>
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>> at
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>> at
>>
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>> at
>>
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>> at
>>
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>> at
>>
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>> at
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>> at
>>
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>> at
>>
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>> at
>>
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>> at
>>
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>> at
>>
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>> at
>>
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>> at org.eclipse.jetty.server.Server.handle(Server.java:365)
>> at
>>
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>> at
>>
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>> at
>>
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>> at
>>
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>> at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>> at
>>
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>> at
>>
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>> at
>>
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>> at
>>
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>> at java.lang.Thread.run(Thread.java:722)
>>
>
>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Furkan KAMACI <fu...@gmail.com>.

Jack, thanks for your answers. OK, when I remove the -Durl parameter I think it
works, thanks. However, I think that I have a problem with my schema. I now get
this error:

Apr 26, 2013 3:52:21 PM org.apache.solr.common.SolrException log
SEVERE: org.apache.solr.common.SolrException: ERROR:
[doc=/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/exampledocs/523387.pdf]
multiple values encountered for non multiValued copy field text:
application/pdf


2013/4/26 Jack Krupansky <ja...@basetechnology.com>

> Maybe you are confusing things by mixing instructions - there are SEPARATE
> instructions for directly using SolrCell and implicitly using it via
> post.jar. Pick which you want and stick with it. DO NOT MIX the
> instructions.
>
> You wrote: " I run that command: java -Durl=
> http://localhost:8983/solr/update/extract -jar post.jar 523387.pdf"
>
> Was there a GOOD reason that you chose that URL?
>
> Best to stay with what the post.jar wiki recommends:
>
> Post all CSV, XML, JSON and PDF documents using AUTO mode which detects
> type based on file name:
>
> java -Dauto -jar post.jar *.csv *.xml *.json *.pdf
>
> Or, stick with SolrCell directly, but follow its distinct instructions:
> http://wiki.apache.org/solr/ExtractingRequestHandler
>
> Again, DO NOT MIX the instructions from the two.
>
> post.jar is designed so that you do not need to know or care exactly how
> rich document indexing works.
>
> -- Jack Krupansky
>
> -----Original Message----- From: Furkan KAMACI
> Sent: Friday, April 26, 2013 5:30 AM
> To: solr-user@lucene.apache.org
> Subject: Document is missing mandatory uniqueKey field: id for Solr PDF
> indexing
>
>
> I use Solr 4.2.1 and these are my fields:
>
> <field name="id" type="string" indexed="true" stored="true" required="true"
> multiValued="false" />
> <field name="text" type="text_general" indexed="true" stored="true"/>
>
>
> <!-- Common metadata fields, named specifically to match up with
> SolrCell metadata when parsing rich documents such as Word, PDF.
> Some fields are multiValued only because Tika currently may return
> multiple values for them. Some metadata is parsed from the documents,
> but there are some which come from the client context:
> "content_type": From the HTTP headers of incoming stream
> "resourcename": From SolrCell request param resource.name
> -->
> <field name="title" type="text_general" indexed="true" stored="true"
> multiValued="true"/>
> <field name="subject" type="text_general" indexed="true" stored="true"/>
> <field name="description" type="text_general" indexed="true"
> stored="true"/>
> <field name="comments" type="text_general" indexed="true" stored="true"/>
> <field name="author" type="text_general" indexed="true" stored="true"/>
> <field name="keywords" type="text_general" indexed="true" stored="true"/>
> <field name="category" type="text_general" indexed="true" stored="true"/>
> <field name="resourcename" type="text_general" indexed="true"
> stored="true"/>
> <field name="url" type="text_general" indexed="true" stored="true"/>
> <field name="content_type" type="string" indexed="true" stored="true"
> multiValued="true"/>
> <field name="last_modified" type="date" indexed="true" stored="true"/>
> <field name="links" type="string" indexed="true" stored="true"
> multiValued="true"/>
>
> <!-- Main body of document extracted by SolrCell.
> NOTE: This field is not indexed by default, since it is also copied to
> "text"
> using copyField below. This is to save space. Use this field for returning
> and
> highlighting document content. Use the "text" field to search the content.
> -->
> <field name="content" type="text_general" indexed="false" stored="true"
> multiValued="true"/>
>
>
> <!-- catchall field, containing all other searchable text fields
> (implemented
> via copyField further on in this schema -->
> <!--
> <field name="text" type="text_general" indexed="true" stored="false"
> multiValued="true"/>
> -->
> <!-- catchall text field that indexes tokens both normally and in reverse
> for efficient
> leading wildcard queries. -->
> <field name="text_rev" type="text_general_rev" indexed="true"
> stored="false" multiValued="true"/>
>
> <!-- non-tokenized version of manufacturer to make it easier to sort or
> group
> results by manufacturer. copied from "manu" via copyField -->
> <field name="manu_exact" type="string" indexed="true" stored="false"/>
>
> <field name="payloads" type="payloads" indexed="true" stored="true"/>
>
> <field name="_version_" type="long" indexed="true" stored="true"/>
>
> I run that command:
>
> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
> 523387.pdf
>
> However I get that error, any ideas?
>
> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
> SEVERE: org.apache.solr.common.SolrException: Document is missing mandatory
> uniqueKey field: id
> at
>
> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
> at
>
> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
> at
>
> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
> at
>
> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
> at
>
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> at
>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
> at
>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> at
>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> at
>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> at
>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> at
>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> at
>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> at
>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> at
>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> at
>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> at
>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> at org.eclipse.jetty.server.Server.handle(Server.java:365)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> at
>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> at
>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> at
>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> at
>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> at
>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> at java.lang.Thread.run(Thread.java:722)
>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Jack Krupansky <ja...@basetechnology.com>.

Maybe you are confusing things by mixing instructions - there are SEPARATE 
instructions for directly using SolrCell and implicitly using it via 
post.jar. Pick which you want and stick with it. DO NOT MIX the 
instructions.

You wrote: " I run that command: 
java -Durl=http://localhost:8983/solr/update/extract -jar post.jar 
523387.pdf"

Was there a GOOD reason that you chose that URL?

Best to stay with what the post.jar wiki recommends:

Post all CSV, XML, JSON and PDF documents using AUTO mode which detects type 
based on file name:

java -Dauto -jar post.jar *.csv *.xml *.json *.pdf

Or, stick with SolrCell directly, but follow its distinct instructions:
http://wiki.apache.org/solr/ExtractingRequestHandler

Again, DO NOT MIX the instructions from the two.

post.jar is designed so that you do not need to know or care exactly how 
rich document indexing works.

-- Jack Krupansky

-----Original Message----- 
From: Furkan KAMACI
Sent: Friday, April 26, 2013 5:30 AM
To: solr-user@lucene.apache.org
Subject: Document is missing mandatory uniqueKey field: id for Solr PDF 
indexing

I use Solr 4.2.1 and these are my fields:

<field name="id" type="string" indexed="true" stored="true" required="true"
multiValued="false" />
<field name="text" type="text_general" indexed="true" stored="true"/>


<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
Some fields are multiValued only because Tika currently may return
multiple values for them. Some metadata is parsed from the documents,
but there are some which come from the client context:
"content_type": From the HTTP headers of incoming stream
"resourcename": From SolrCell request param resource.name
-->
<field name="title" type="text_general" indexed="true" stored="true"
multiValued="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="resourcename" type="text_general" indexed="true"
stored="true"/>
<field name="url" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true"
multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true"
multiValued="true"/>

<!-- Main body of document extracted by SolrCell.
NOTE: This field is not indexed by default, since it is also copied to
"text"
using copyField below. This is to save space. Use this field for returning
and
highlighting document content. Use the "text" field to search the content.
-->
<field name="content" type="text_general" indexed="false" stored="true"
multiValued="true"/>


<!-- catchall field, containing all other searchable text fields
(implemented
via copyField further on in this schema) -->
<!--
<field name="text" type="text_general" indexed="true" stored="false"
multiValued="true"/>
-->
<!-- catchall text field that indexes tokens both normally and in reverse
for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_general_rev" indexed="true"
stored="false" multiValued="true"/>

<!-- non-tokenized version of manufacturer to make it easier to sort or
group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>

<field name="payloads" type="payloads" indexed="true" stored="true"/>

<field name="_version_" type="long" indexed="true" stored="true"/>

I run this command:

java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
523387.pdf

However, I get this error. Any ideas?

Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
SEVERE: org.apache.solr.common.SolrException: Document is missing mandatory
uniqueKey field: id
at
org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
at
org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
at org.eclipse.jetty.server.Server.handle(Server.java:365)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
at java.lang.Thread.run(Thread.java:722)

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Furkan KAMACI <fu...@gmail.com>.

I think that I should start a new thread for my question to help people who
search for the same situation.

2013/4/26 Furkan KAMACI <fu...@gmail.com>

> If you can help me it would be nice. I get that error:
>
> SimplePostTool version 1.5
> Posting files to base url http://localhost:8983/solr/update/extract..
> Entering auto mode. File endings considered are
> xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log
> POSTing file 523387.pdf (application/pdf)
> SimplePostTool: WARNING: Solr returned an error #404 Not Found
> SimplePostTool: WARNING: IOException while reading response:
> java.io.FileNotFoundException:
> http://localhost:8983/solr/update/extract/extract?resource.name=%2Fhome%2Fll%2FDesktop%2Fb%2Flucene-solr-lucene_solr_4_2_1%2Fsolr%2Fexample%2Fexampledocs%2F523387.pdf&literal.id=%2Fhome%2Fll%2FDesktop%2Fb%2Flucene-solr-lucene_solr_4_2_1%2Fsolr%2Fexample%2Fexampledocs%2F523387.pdf
> 1 files indexed.
> COMMITting Solr index changes to http://localhost:8983/solr/update/extract
> ..
> Disconnected from the target VM, address: '127.0.0.1:58385', transport:
> 'socket'
> Time spent: 0:00:00.194
>
> and there is nothing indexed. Here is my server log:
>
> Apr 26, 2013 2:55:58 PM org.apache.solr.update.DirectUpdateHandler2 commit
> INFO: start
> commit{,optimize=false,openSearcher=true,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
> Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrDeletionPolicy onCommit
> INFO: SolrDeletionPolicy.onCommit: commits:num=2
>  commit{dir=NRTCachingDirectory(org.apache.lucene.store.MMapDirectory@/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/solr/collection1/data/index
> lockFactory=org.apache.lucene.store.NativeFSLockFactory@386b8592;
> maxCacheMB=48.0
> maxMergeSizeMB=4.0),segFN=segments_c,generation=12,filenames=[segments_c]
>  commit{dir=NRTCachingDirectory(org.apache.lucene.store.MMapDirectory@/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/solr/collection1/data/index
> lockFactory=org.apache.lucene.store.NativeFSLockFactory@386b8592;
> maxCacheMB=48.0
> maxMergeSizeMB=4.0),segFN=segments_d,generation=13,filenames=[segments_d]
> Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrDeletionPolicy
> updateCommits
> INFO: newest commit = 13[segments_d]
> Apr 26, 2013 2:55:58 PM org.apache.solr.search.SolrIndexSearcher <init>
> INFO: Opening Searcher@37342445 main
> Apr 26, 2013 2:55:58 PM org.apache.solr.update.DirectUpdateHandler2 commit
> INFO: end_commit_flush
> Apr 26, 2013 2:55:58 PM org.apache.solr.core.QuerySenderListener
> newSearcher
> INFO: QuerySenderListener sending requests to Searcher@37342445main{StandardDirectoryReader(segments_2:1:nrt)}
> Apr 26, 2013 2:55:58 PM org.apache.solr.core.QuerySenderListener
> newSearcher
> INFO: QuerySenderListener done.
> Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrCore registerSearcher
> INFO: [collection1] Registered new searcher Searcher@37342445main{StandardDirectoryReader(segments_2:1:nrt)}
> Apr 26, 2013 2:55:58 PM
> org.apache.solr.update.processor.LogUpdateProcessor finish
> INFO: [collection1] webapp=/solr path=/update/extract params={commit=true}
> {commit=} 0 156
>
>
>
>
>
> 2013/4/26 Jan Høydahl <ja...@cominvent.com>
>
>> http://wiki.apache.org/solr/post.jar
>>
>> --
>> Jan Høydahl, search solution architect
>> Cominvent AS - www.cominvent.com
>> Solr Training - www.solrtraining.com
>>
>> 26. apr. 2013 kl. 13:28 skrev Furkan KAMACI <fu...@gmail.com>:
>>
>> > Hi Raymond;
>> >
>> > Now I get that error: SimplePostTool: WARNING: IOException while reading
>> > response: java.io.FileNotFoundException:
>> >
>> > 2013/4/26 Raymond Wiker <rw...@gmail.com>
>> >
>> >> You could start by doing
>> >>
>> >> java -jar post.jar -help
>> >>
>> >> --- the 7th example shows exactly what you need to do to add a
>> document id.
>> >>
>> >> On Fri, Apr 26, 2013 at 11:30 AM, Furkan KAMACI <
>> furkankamaci@gmail.com
>> >>> wrote:
>> >>
>> >>> I use Solr 4.2.1 and these are my fields:
>> >>>
>> >>> <field name="id" type="string" indexed="true" stored="true"
>> >> required="true"
>> >>> multiValued="false" />
>> >>> <field name="text" type="text_general" indexed="true" stored="true"/>
>> >>>
>> >>>
>> >>> <!-- Common metadata fields, named specifically to match up with
>> >>> SolrCell metadata when parsing rich documents such as Word, PDF.
>> >>> Some fields are multiValued only because Tika currently may return
>> >>> multiple values for them. Some metadata is parsed from the documents,
>> >>> but there are some which come from the client context:
>> >>> "content_type": From the HTTP headers of incoming stream
>> >>> "resourcename": From SolrCell request param resource.name
>> >>> -->
>> >>> <field name="title" type="text_general" indexed="true" stored="true"
>> >>> multiValued="true"/>
>> >>> <field name="subject" type="text_general" indexed="true"
>> stored="true"/>
>> >>> <field name="description" type="text_general" indexed="true"
>> >>> stored="true"/>
>> >>> <field name="comments" type="text_general" indexed="true"
>> stored="true"/>
>> >>> <field name="author" type="text_general" indexed="true"
>> stored="true"/>
>> >>> <field name="keywords" type="text_general" indexed="true"
>> stored="true"/>
>> >>> <field name="category" type="text_general" indexed="true"
>> stored="true"/>
>> >>> <field name="resourcename" type="text_general" indexed="true"
>> >>> stored="true"/>
>> >>> <field name="url" type="text_general" indexed="true" stored="true"/>
>> >>> <field name="content_type" type="string" indexed="true" stored="true"
>> >>> multiValued="true"/>
>> >>> <field name="last_modified" type="date" indexed="true" stored="true"/>
>> >>> <field name="links" type="string" indexed="true" stored="true"
>> >>> multiValued="true"/>
>> >>>
>> >>> <!-- Main body of document extracted by SolrCell.
>> >>> NOTE: This field is not indexed by default, since it is also copied to
>> >>> "text"
>> >>> using copyField below. This is to save space. Use this field for
>> >> returning
>> >>> and
>> >>> highlighting document content. Use the "text" field to search the
>> >> content.
>> >>> -->
>> >>> <field name="content" type="text_general" indexed="false"
>> stored="true"
>> >>> multiValued="true"/>
>> >>>
>> >>>
>> >>> <!-- catchall field, containing all other searchable text fields
>> >>> (implemented
>> >>> via copyField further on in this schema -->
>> >>> <!--
>> >>> <field name="text" type="text_general" indexed="true" stored="false"
>> >>> multiValued="true"/>
>> >>> -->
>> >>> <!-- catchall text field that indexes tokens both normally and in
>> reverse
>> >>> for efficient
>> >>> leading wildcard queries. -->
>> >>> <field name="text_rev" type="text_general_rev" indexed="true"
>> >>> stored="false" multiValued="true"/>
>> >>>
>> >>> <!-- non-tokenized version of manufacturer to make it easier to sort
>> or
>> >>> group
>> >>> results by manufacturer. copied from "manu" via copyField -->
>> >>> <field name="manu_exact" type="string" indexed="true" stored="false"/>
>> >>>
>> >>> <field name="payloads" type="payloads" indexed="true" stored="true"/>
>> >>>
>> >>> <field name="_version_" type="long" indexed="true" stored="true"/>
>> >>>
>> >>> I run that command:
>> >>>
>> >>> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
>> >>> 523387.pdf
>> >>>
>> >>> However I get that error, any ideas?
>> >>>
>> >>> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
>> >>> SEVERE: org.apache.solr.common.SolrException: Document is missing
>> >> mandatory
>> >>> uniqueKey field: id
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>> >>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>> >>> at
>> >>>
>> >>
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>> >>> at
>> >>>
>> >>
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>> >>> at
>> >>>
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>> >>> at org.eclipse.jetty.server.Server.handle(Server.java:365)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>> >>> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>> >>> at
>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>> >>> at
>> >>>
>> >>>
>> >>
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>> >>> at java.lang.Thread.run(Thread.java:722)
>> >>>
>> >>
>>
>>
>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Furkan KAMACI <fu...@gmail.com>.

It would be nice if you could help me. I get this error:

SimplePostTool version 1.5
Posting files to base url http://localhost:8983/solr/update/extract..
Entering auto mode. File endings considered are
xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log
POSTing file 523387.pdf (application/pdf)
SimplePostTool: WARNING: Solr returned an error #404 Not Found
SimplePostTool: WARNING: IOException while reading response:
java.io.FileNotFoundException:
http://localhost:8983/solr/update/extract/extract?resource.name=%2Fhome%2Fll%2FDesktop%2Fb%2Flucene-solr-lucene_solr_4_2_1%2Fsolr%2Fexample%2Fexampledocs%2F523387.pdf&literal.id=%2Fhome%2Fll%2FDesktop%2Fb%2Flucene-solr-lucene_solr_4_2_1%2Fsolr%2Fexample%2Fexampledocs%2F523387.pdf
1 files indexed.
COMMITting Solr index changes to http://localhost:8983/solr/update/extract..
Disconnected from the target VM, address: '127.0.0.1:58385', transport:
'socket'
Time spent: 0:00:00.194

and there is nothing indexed. Here is my server log:

Apr 26, 2013 2:55:58 PM org.apache.solr.update.DirectUpdateHandler2 commit
INFO: start
commit{,optimize=false,openSearcher=true,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrDeletionPolicy onCommit
INFO: SolrDeletionPolicy.onCommit: commits:num=2
 commit{dir=NRTCachingDirectory(org.apache.lucene.store.MMapDirectory@/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/solr/collection1/data/index
lockFactory=org.apache.lucene.store.NativeFSLockFactory@386b8592;
maxCacheMB=48.0
maxMergeSizeMB=4.0),segFN=segments_c,generation=12,filenames=[segments_c]
 commit{dir=NRTCachingDirectory(org.apache.lucene.store.MMapDirectory@/home/ll/Desktop/b/lucene-solr-lucene_solr_4_2_1/solr/example/solr/collection1/data/index
lockFactory=org.apache.lucene.store.NativeFSLockFactory@386b8592;
maxCacheMB=48.0
maxMergeSizeMB=4.0),segFN=segments_d,generation=13,filenames=[segments_d]
Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrDeletionPolicy
updateCommits
INFO: newest commit = 13[segments_d]
Apr 26, 2013 2:55:58 PM org.apache.solr.search.SolrIndexSearcher <init>
INFO: Opening Searcher@37342445 main
Apr 26, 2013 2:55:58 PM org.apache.solr.update.DirectUpdateHandler2 commit
INFO: end_commit_flush
Apr 26, 2013 2:55:58 PM org.apache.solr.core.QuerySenderListener newSearcher
INFO: QuerySenderListener sending requests to
Searcher@37342445main{StandardDirectoryReader(segments_2:1:nrt)}
Apr 26, 2013 2:55:58 PM org.apache.solr.core.QuerySenderListener newSearcher
INFO: QuerySenderListener done.
Apr 26, 2013 2:55:58 PM org.apache.solr.core.SolrCore registerSearcher
INFO: [collection1] Registered new searcher
Searcher@37342445main{StandardDirectoryReader(segments_2:1:nrt)}
Apr 26, 2013 2:55:58 PM org.apache.solr.update.processor.LogUpdateProcessor
finish
INFO: [collection1] webapp=/solr path=/update/extract params={commit=true}
{commit=} 0 156





2013/4/26 Jan Høydahl <ja...@cominvent.com>

> http://wiki.apache.org/solr/post.jar
>
> --
> Jan Høydahl, search solution architect
> Cominvent AS - www.cominvent.com
> Solr Training - www.solrtraining.com
>
> 26. apr. 2013 kl. 13:28 skrev Furkan KAMACI <fu...@gmail.com>:
>
> > Hi Raymond;
> >
> > Now I get that error: SimplePostTool: WARNING: IOException while reading
> > response: java.io.FileNotFoundException:
> >
> > 2013/4/26 Raymond Wiker <rw...@gmail.com>
> >
> >> You could start by doing
> >>
> >> java -jar post.jar -help
> >>
> >> --- the 7th example shows exactly what you need to do to add a document
> id.
> >>
> >> On Fri, Apr 26, 2013 at 11:30 AM, Furkan KAMACI <furkankamaci@gmail.com
> >>> wrote:
> >>
> >>> I use Solr 4.2.1 and these are my fields:
> >>>
> >>> <field name="id" type="string" indexed="true" stored="true"
> >> required="true"
> >>> multiValued="false" />
> >>> <field name="text" type="text_general" indexed="true" stored="true"/>
> >>>
> >>>
> >>> <!-- Common metadata fields, named specifically to match up with
> >>> SolrCell metadata when parsing rich documents such as Word, PDF.
> >>> Some fields are multiValued only because Tika currently may return
> >>> multiple values for them. Some metadata is parsed from the documents,
> >>> but there are some which come from the client context:
> >>> "content_type": From the HTTP headers of incoming stream
> >>> "resourcename": From SolrCell request param resource.name
> >>> -->
> >>> <field name="title" type="text_general" indexed="true" stored="true"
> >>> multiValued="true"/>
> >>> <field name="subject" type="text_general" indexed="true"
> stored="true"/>
> >>> <field name="description" type="text_general" indexed="true"
> >>> stored="true"/>
> >>> <field name="comments" type="text_general" indexed="true"
> stored="true"/>
> >>> <field name="author" type="text_general" indexed="true" stored="true"/>
> >>> <field name="keywords" type="text_general" indexed="true"
> stored="true"/>
> >>> <field name="category" type="text_general" indexed="true"
> stored="true"/>
> >>> <field name="resourcename" type="text_general" indexed="true"
> >>> stored="true"/>
> >>> <field name="url" type="text_general" indexed="true" stored="true"/>
> >>> <field name="content_type" type="string" indexed="true" stored="true"
> >>> multiValued="true"/>
> >>> <field name="last_modified" type="date" indexed="true" stored="true"/>
> >>> <field name="links" type="string" indexed="true" stored="true"
> >>> multiValued="true"/>
> >>>
> >>> <!-- Main body of document extracted by SolrCell.
> >>> NOTE: This field is not indexed by default, since it is also copied to
> >>> "text"
> >>> using copyField below. This is to save space. Use this field for
> >> returning
> >>> and
> >>> highlighting document content. Use the "text" field to search the
> >> content.
> >>> -->
> >>> <field name="content" type="text_general" indexed="false" stored="true"
> >>> multiValued="true"/>
> >>>
> >>>
> >>> <!-- catchall field, containing all other searchable text fields
> >>> (implemented
> >>> via copyField further on in this schema -->
> >>> <!--
> >>> <field name="text" type="text_general" indexed="true" stored="false"
> >>> multiValued="true"/>
> >>> -->
> >>> <!-- catchall text field that indexes tokens both normally and in
> reverse
> >>> for efficient
> >>> leading wildcard queries. -->
> >>> <field name="text_rev" type="text_general_rev" indexed="true"
> >>> stored="false" multiValued="true"/>
> >>>
> >>> <!-- non-tokenized version of manufacturer to make it easier to sort or
> >>> group
> >>> results by manufacturer. copied from "manu" via copyField -->
> >>> <field name="manu_exact" type="string" indexed="true" stored="false"/>
> >>>
> >>> <field name="payloads" type="payloads" indexed="true" stored="true"/>
> >>>
> >>> <field name="_version_" type="long" indexed="true" stored="true"/>
> >>>
> >>> I run that command:
> >>>
> >>> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
> >>> 523387.pdf
> >>>
> >>> However I get that error, any ideas?
> >>>
> >>> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
> >>> SEVERE: org.apache.solr.common.SolrException: Document is missing
> >> mandatory
> >>> uniqueKey field: id
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
> >>> at
> >>>
> >>>
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>> at
> >>>
> >>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>> at
> >>>
> >>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>> at
> >>>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>> at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>> at
> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>> at
> >>>
> >>>
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>> at java.lang.Thread.run(Thread.java:722)
> >>>
> >>
>
>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Jan Høydahl <ja...@cominvent.com>.

http://wiki.apache.org/solr/post.jar

--
Jan Høydahl, search solution architect
Cominvent AS - www.cominvent.com
Solr Training - www.solrtraining.com

26. apr. 2013 kl. 13:28 skrev Furkan KAMACI <fu...@gmail.com>:

> Hi Raymond;
> 
> Now I get that error: SimplePostTool: WARNING: IOException while reading
> response: java.io.FileNotFoundException:
> 
> 2013/4/26 Raymond Wiker <rw...@gmail.com>
> 
>> You could start by doing
>> 
>> java -jar post.jar -help
>> 
>> --- the 7th example shows exactly what you need to do to add a document id.
>> 
>> On Fri, Apr 26, 2013 at 11:30 AM, Furkan KAMACI <furkankamaci@gmail.com
>>> wrote:
>> 
>>> I use Solr 4.2.1 and these are my fields:
>>> 
>>> <field name="id" type="string" indexed="true" stored="true"
>> required="true"
>>> multiValued="false" />
>>> <field name="text" type="text_general" indexed="true" stored="true"/>
>>> 
>>> 
>>> <!-- Common metadata fields, named specifically to match up with
>>> SolrCell metadata when parsing rich documents such as Word, PDF.
>>> Some fields are multiValued only because Tika currently may return
>>> multiple values for them. Some metadata is parsed from the documents,
>>> but there are some which come from the client context:
>>> "content_type": From the HTTP headers of incoming stream
>>> "resourcename": From SolrCell request param resource.name
>>> -->
>>> <field name="title" type="text_general" indexed="true" stored="true"
>>> multiValued="true"/>
>>> <field name="subject" type="text_general" indexed="true" stored="true"/>
>>> <field name="description" type="text_general" indexed="true"
>>> stored="true"/>
>>> <field name="comments" type="text_general" indexed="true" stored="true"/>
>>> <field name="author" type="text_general" indexed="true" stored="true"/>
>>> <field name="keywords" type="text_general" indexed="true" stored="true"/>
>>> <field name="category" type="text_general" indexed="true" stored="true"/>
>>> <field name="resourcename" type="text_general" indexed="true"
>>> stored="true"/>
>>> <field name="url" type="text_general" indexed="true" stored="true"/>
>>> <field name="content_type" type="string" indexed="true" stored="true"
>>> multiValued="true"/>
>>> <field name="last_modified" type="date" indexed="true" stored="true"/>
>>> <field name="links" type="string" indexed="true" stored="true"
>>> multiValued="true"/>
>>> 
>>> <!-- Main body of document extracted by SolrCell.
>>> NOTE: This field is not indexed by default, since it is also copied to
>>> "text"
>>> using copyField below. This is to save space. Use this field for
>> returning
>>> and
>>> highlighting document content. Use the "text" field to search the
>> content.
>>> -->
>>> <field name="content" type="text_general" indexed="false" stored="true"
>>> multiValued="true"/>
>>> 
>>> 
>>> <!-- catchall field, containing all other searchable text fields
>>> (implemented
>>> via copyField further on in this schema -->
>>> <!--
>>> <field name="text" type="text_general" indexed="true" stored="false"
>>> multiValued="true"/>
>>> -->
>>> <!-- catchall text field that indexes tokens both normally and in reverse
>>> for efficient
>>> leading wildcard queries. -->
>>> <field name="text_rev" type="text_general_rev" indexed="true"
>>> stored="false" multiValued="true"/>
>>> 
>>> <!-- non-tokenized version of manufacturer to make it easier to sort or
>>> group
>>> results by manufacturer. copied from "manu" via copyField -->
>>> <field name="manu_exact" type="string" indexed="true" stored="false"/>
>>> 
>>> <field name="payloads" type="payloads" indexed="true" stored="true"/>
>>> 
>>> <field name="_version_" type="long" indexed="true" stored="true"/>
>>> 
>>> I run that command:
>>> 
>>> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
>>> 523387.pdf
>>> 
>>> However I get that error, any ideas?
>>> 
>>> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
>>> SEVERE: org.apache.solr.common.SolrException: Document is missing
>>> mandatory uniqueKey field: id
>>> at
>>> 
>>> 
>> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
>>> at
>>> 
>>> 
>> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
>>> at
>>> 
>>> 
>> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
>>> at
>>> 
>>> 
>> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
>>> at
>>> 
>>> 
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
>>> at
>>> 
>>> 
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
>>> at
>>> 
>>> 
>> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
>>> at
>>> 
>>> 
>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>>> at
>>> 
>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
>>> at
>>> 
>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
>>> at
>>> 
>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
>>> at
>>> 
>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>> at
>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>> at
>>> 
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>> at
>>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>> at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>> at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>> at
>>> 
>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>> at java.lang.Thread.run(Thread.java:722)
>>> 
>>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Furkan KAMACI <fu...@gmail.com>.

Hi Raymond,

Now I get this error: SimplePostTool: WARNING: IOException while reading
response: java.io.FileNotFoundException:

2013/4/26 Raymond Wiker <rw...@gmail.com>

> You could start by doing
>
> java -jar post.jar -help
>
> --- the 7th example shows exactly what you need to do to add a document id.
>
> On Fri, Apr 26, 2013 at 11:30 AM, Furkan KAMACI <furkankamaci@gmail.com>
> wrote:
>
> > I use Solr 4.2.1 and these are my fields:
> >
> > <field name="id" type="string" indexed="true" stored="true"
> required="true"
> > multiValued="false" />
> > <field name="text" type="text_general" indexed="true" stored="true"/>
> >
> >
> > <!-- Common metadata fields, named specifically to match up with
> > SolrCell metadata when parsing rich documents such as Word, PDF.
> > Some fields are multiValued only because Tika currently may return
> > multiple values for them. Some metadata is parsed from the documents,
> > but there are some which come from the client context:
> > "content_type": From the HTTP headers of incoming stream
> > "resourcename": From SolrCell request param resource.name
> > -->
> > <field name="title" type="text_general" indexed="true" stored="true"
> > multiValued="true"/>
> > <field name="subject" type="text_general" indexed="true" stored="true"/>
> > <field name="description" type="text_general" indexed="true"
> > stored="true"/>
> > <field name="comments" type="text_general" indexed="true" stored="true"/>
> > <field name="author" type="text_general" indexed="true" stored="true"/>
> > <field name="keywords" type="text_general" indexed="true" stored="true"/>
> > <field name="category" type="text_general" indexed="true" stored="true"/>
> > <field name="resourcename" type="text_general" indexed="true"
> > stored="true"/>
> > <field name="url" type="text_general" indexed="true" stored="true"/>
> > <field name="content_type" type="string" indexed="true" stored="true"
> > multiValued="true"/>
> > <field name="last_modified" type="date" indexed="true" stored="true"/>
> > <field name="links" type="string" indexed="true" stored="true"
> > multiValued="true"/>
> >
> > <!-- Main body of document extracted by SolrCell.
> > NOTE: This field is not indexed by default, since it is also copied to
> > "text" using copyField below. This is to save space. Use this field for
> > returning and highlighting document content. Use the "text" field to
> > search the content.
> > -->
> > <field name="content" type="text_general" indexed="false" stored="true"
> > multiValued="true"/>
> >
> >
> > <!-- catchall field, containing all other searchable text fields
> > (implemented
> > via copyField further on in this schema -->
> > <!--
> > <field name="text" type="text_general" indexed="true" stored="false"
> > multiValued="true"/>
> > -->
> > <!-- catchall text field that indexes tokens both normally and in reverse
> > for efficient
> > leading wildcard queries. -->
> > <field name="text_rev" type="text_general_rev" indexed="true"
> > stored="false" multiValued="true"/>
> >
> > <!-- non-tokenized version of manufacturer to make it easier to sort or
> > group
> > results by manufacturer. copied from "manu" via copyField -->
> > <field name="manu_exact" type="string" indexed="true" stored="false"/>
> >
> > <field name="payloads" type="payloads" indexed="true" stored="true"/>
> >
> > <field name="_version_" type="long" indexed="true" stored="true"/>
> >
> > I run that command:
> >
> > java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
> > 523387.pdf
> >
> > However I get that error, any ideas?
> >
> > Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
> > SEVERE: org.apache.solr.common.SolrException: Document is missing
> > mandatory uniqueKey field: id
> > at
> >
> >
> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
> > at
> >
> >
> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
> > at
> >
> >
> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
> > at
> >
> >
> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> > at
> >
> >
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
> > at
> >
> >
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
> > at
> >
> >
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
> > at
> >
> >
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> > at
> >
> >
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> > at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
> > at
> >
> >
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
> > at
> >
> >
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
> > at
> >
> >
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
> > at
> >
> >
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> > at
> >
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> > at
> >
> >
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> > at
> >
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> > at
> >
> >
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> > at
> >
> >
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> > at
> > org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> > at
> >
> >
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> > at
> >
> >
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> > at
> >
> >
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> > at
> >
> >
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> > at
> >
> >
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> > at
> >
> >
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> > at org.eclipse.jetty.server.Server.handle(Server.java:365)
> > at
> >
> >
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> > at
> >
> >
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> > at
> >
> >
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> > at
> >
> >
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> > at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> > at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> > at
> >
> >
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> > at
> >
> >
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> > at
> >
> >
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> > at
> >
> >
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> > at java.lang.Thread.run(Thread.java:722)
> >
>

Re: Document is missing mandatory uniqueKey field: id for Solr PDF indexing

Posted by Raymond Wiker <rw...@gmail.com>.

You could start by doing

java -jar post.jar -help

--- the 7th example shows exactly what you need to do to add a document id.

On Fri, Apr 26, 2013 at 11:30 AM, Furkan KAMACI <fu...@gmail.com> wrote:

> I use Solr 4.2.1 and these are my fields:
>
> <field name="id" type="string" indexed="true" stored="true" required="true"
> multiValued="false" />
> <field name="text" type="text_general" indexed="true" stored="true"/>
>
>
> <!-- Common metadata fields, named specifically to match up with
> SolrCell metadata when parsing rich documents such as Word, PDF.
> Some fields are multiValued only because Tika currently may return
> multiple values for them. Some metadata is parsed from the documents,
> but there are some which come from the client context:
> "content_type": From the HTTP headers of incoming stream
> "resourcename": From SolrCell request param resource.name
> -->
> <field name="title" type="text_general" indexed="true" stored="true"
> multiValued="true"/>
> <field name="subject" type="text_general" indexed="true" stored="true"/>
> <field name="description" type="text_general" indexed="true"
> stored="true"/>
> <field name="comments" type="text_general" indexed="true" stored="true"/>
> <field name="author" type="text_general" indexed="true" stored="true"/>
> <field name="keywords" type="text_general" indexed="true" stored="true"/>
> <field name="category" type="text_general" indexed="true" stored="true"/>
> <field name="resourcename" type="text_general" indexed="true"
> stored="true"/>
> <field name="url" type="text_general" indexed="true" stored="true"/>
> <field name="content_type" type="string" indexed="true" stored="true"
> multiValued="true"/>
> <field name="last_modified" type="date" indexed="true" stored="true"/>
> <field name="links" type="string" indexed="true" stored="true"
> multiValued="true"/>
>
> <!-- Main body of document extracted by SolrCell.
> NOTE: This field is not indexed by default, since it is also copied to
> "text"
> using copyField below. This is to save space. Use this field for returning
> and
> highlighting document content. Use the "text" field to search the content.
> -->
> <field name="content" type="text_general" indexed="false" stored="true"
> multiValued="true"/>
>
>
> <!-- catchall field, containing all other searchable text fields
> (implemented via copyField further on in this schema) -->
> <!--
> <field name="text" type="text_general" indexed="true" stored="false"
> multiValued="true"/>
> -->
> <!-- catchall text field that indexes tokens both normally and in reverse
> for efficient
> leading wildcard queries. -->
> <field name="text_rev" type="text_general_rev" indexed="true"
> stored="false" multiValued="true"/>
>
> <!-- non-tokenized version of manufacturer to make it easier to sort or
> group
> results by manufacturer. copied from "manu" via copyField -->
> <field name="manu_exact" type="string" indexed="true" stored="false"/>
>
> <field name="payloads" type="payloads" indexed="true" stored="true"/>
>
> <field name="_version_" type="long" indexed="true" stored="true"/>
>
> I run that command:
>
> java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
> 523387.pdf
>
> However, I get this error. Any ideas?
>
> Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
> SEVERE: org.apache.solr.common.SolrException: Document is missing mandatory
> uniqueKey field: id
> at
>
> org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
> at
>
> org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
> at
>
> org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
> at
>
> org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
> at
>
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
> at
>
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> at
>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
> at
>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
> at
>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> at
>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> at
>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> at
>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> at
>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> at
>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> at
>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> at
>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> at
>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> at
>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> at org.eclipse.jetty.server.Server.handle(Server.java:365)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> at
>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> at
>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> at
>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> at
>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> at
>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> at
>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> at java.lang.Thread.run(Thread.java:722)
>