You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Andreas Owen <ao...@conx.ch> on 2013/09/26 12:17:09 UTC

XPathEntityProcessor nested in TikaEntityProcessor query null exception

I'm using Solr 4.3.1 and the DataImportHandler. I am trying to use XPathEntityProcessor within the TikaEntityProcessor for indexing HTML pages, but I'm getting this error for each document. I have also tried dataField="tika.text" and dataField="text", to no avail. The nested XPathEntityProcessor "detail" creates the error; the rest works fine. What am I doing wrong?

error:

ERROR - 2013-09-26 12:08:49.006; org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed 'null'
java.lang.ClassCastException: java.io.StringReader cannot be cast to java.util.Iterator
	at org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
	at org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
	at org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
	at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
	at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
	at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
	at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
	at org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
	at org.eclipse.jetty.server.Server.handle(Server.java:365)
	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
	at java.lang.Thread.run(Unknown Source)
ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException; Exception in entity : detail:org.apache.solr.handler.dataimport.DataImportHandlerException: java.lang.ClassCastException: java.io.StringReader cannot be cast to java.util.Iterator
	at org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
	at org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
	at org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
	at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
	at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
	at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
	at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
	at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
	at org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
	at org.eclipse.jetty.server.Server.handle(Server.java:365)
	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.ClassCastException: java.io.StringReader cannot be cast to java.util.Iterator
	at org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
	... 41 more



data-config.xml

<dataConfig>
	<dataSource type="BinURLDataSource" name="dataFile"/>
	<dataSource type="BinURLDataSource" name="dataUrl"/>
	<dataSource type="URLDataSource" name="main"/>
	<dataSource type="FieldReaderDataSource" name="fld"/>
<document>
<entity name="rec" processor="XPathEntityProcessor" url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml" forEach="/docs/doc" dataSource="main"> 
		<field column="title" xpath="//title" />
		<field column="id" xpath="//id" />
		<field column="file" xpath="//file" />
		<field column="url" xpath="//url" />
		<field column="urlParse" xpath="//urlParse" />
		<field column="last_modified" xpath="//last_modified" />
		<field column="Author" xpath="//author" />
		
		<entity name="tika" processor="TikaEntityProcessor" url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
			<field column="text"/>
			
			<entity name="detail" type="XPathEntityProcessor" forEach="/html" dataSource="fld" dataField="${tika.text}" rootEntity="true" onError="skip">
				<field xpath="//h1" column="h_1" />				
			</entity>
		</entity>
	</entity>
</document>
</dataConfig>

Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Andreas Owen <ao...@conx.ch>.
Thanks, but the first suggestion is already implemented and the second didn't work. I have also tried htmlMapper="identity", but nothing worked.

i also tried this but the html was stripped in both fields

<entity name="tika" processor="TikaEntityProcessor" url="${rec.urlParse}" dataSource="dataUrl" onError="skip" htmlMapper="identity" format="html" transformer="HTMLStripTransformer">
			<field column="text" name="text" stripHTML="false" />
			<field column="text" name="text_nohtml" stripHTML="true" />

But in the end I think it's best to cut Tika out, because I'm not getting any benefit from it. I would just need to get this to work:

	<field xpath="//h:h1" column="h_1" />
	<field column="text" xpath="/xhtml:html/xhtml:body" />

the fields are empty and i'm not getting any errors in the logs.


On 28. Sep 2013, at 2:43 AM, Alexandre Rafalovitch wrote:

> This is a rather complicated example to chew through, but try the following
> two things:
> *) dataField="${tika.text}"  => dataField="text" (or less likely htmlMapper
> tika.text)
> You might be trying to read content of the field rather than passing
> reference to the field that seems to be expected. This might explain the
> exception.
> 
> *) It may help to be aware of
> https://issues.apache.org/jira/browse/SOLR-4530 . There is a new
> htmlMapper="identity" flag on Tika entries to ensure more of HTML structure
> passing through. By default, Tika strips out most of the HTML tags.
> 
> Regards,
>   Alex.
> 
> On Thu, Sep 26, 2013 at 5:17 PM, Andreas Owen <ao...@conx.ch> wrote:
> 
>>                <entity name="tika" processor="TikaEntityProcessor"
>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>>                        <field column="text"/>
>> 
>>                        <entity name="detail" type="XPathEntityProcessor"
>> forEach="/html" dataSource="fld" dataField="${tika.text}" rootEntity="true"
>> onError="skip">
>>                                <field xpath="//h1" column="h_1" />
>>                        </entity>
>>                </entity>
>> 
> 
> 
> 
> Personal website: http://www.outerthoughts.com/
> LinkedIn: http://www.linkedin.com/in/alexandrerafalovitch
> - Time is the quality of nature that keeps events from happening all at
> once. Lately, it doesn't seem to be working.  (Anonymous  - via GTD book)


Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Alexandre Rafalovitch <ar...@gmail.com>.
This is a rather complicated example to chew through, but try the following
two things:
*) dataField="${tika.text}"  => dataField="text" (or less likely htmlMapper
tika.text)
You might be trying to read content of the field rather than passing
reference to the field that seems to be expected. This might explain the
exception.

*) It may help to be aware of
https://issues.apache.org/jira/browse/SOLR-4530 . There is a new
htmlMapper="identity" flag on Tika entries to ensure more of HTML structure
passing through. By default, Tika strips out most of the HTML tags.

Regards,
   Alex.

On Thu, Sep 26, 2013 at 5:17 PM, Andreas Owen <ao...@conx.ch> wrote:

>                 <entity name="tika" processor="TikaEntityProcessor"
> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>                         <field column="text"/>
>
>                         <entity name="detail" type="XPathEntityProcessor"
> forEach="/html" dataSource="fld" dataField="${tika.text}" rootEntity="true"
> onError="skip">
>                                 <field xpath="//h1" column="h_1" />
>                         </entity>
>                 </entity>
>



Personal website: http://www.outerthoughts.com/
LinkedIn: http://www.linkedin.com/in/alexandrerafalovitch
- Time is the quality of nature that keeps events from happening all at
once. Lately, it doesn't seem to be working.  (Anonymous  - via GTD book)

Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Andreas Owen <ao...@conx.ch>.
i'm already using URLDataSource

On 30. Sep 2013, at 5:41 PM, P Williams wrote:

> Hi Andreas,
> 
> When using XPathEntityProcessor<http://wiki.apache.org/solr/DataImportHandler#XPathEntityProcessor> your
> DataSource
> must be of type DataSource<Reader>.  You shouldn't be using
> BinURLDataSource, it's giving you the cast exception.  Use
> URLDataSource<https://builds.apache.org/job/Solr-Artifacts-4.x/javadoc/solr-dataimporthandler/org/apache/solr/handler/dataimport/URLDataSource.html>
> or
> FileDataSource<https://builds.apache.org/job/Solr-Artifacts-4.x/javadoc/solr-dataimporthandler/org/apache/solr/handler/dataimport/FileDataSource.html> instead.
> 
> I don't think you need to specify namespaces, at least you didn't used to.
> The other thing that I've noticed is that the anywhere xpath expression //
> doesn't always work in DIH.  You might have to be more specific.
> 
> Cheers,
> Tricia
> 
> 
> 
> 
> 
> On Sun, Sep 29, 2013 at 9:47 AM, Andreas Owen <ao...@conx.ch> wrote:
> 
>> how dumb can you get. obviously quite dumb... i would have to analyze the
>> html-pages with a nested instance like this:
>> 
>> <entity name="rec" processor="XPathEntityProcessor"
>> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
>> forEach="/docs/doc" dataSource="main">
>> 
>>                <entity name="htm" processor="XPathEntityProcessor"
>> url="${rec.urlParse}" forEach="/xhtml:html" dataSource="dataUrl">
>>                        <field column="text" xpath="//content" />
>>                        <field column="h_2" xpath="//body" />
>>                        <field column="text_nohtml" xpath="//text" />
>>                        <field column="h_1" xpath="//h:h1" />
>>                </entity>
>> </entity>
>> 
>> but i'm pretty sure the forEach and the xpath expressions are wrong. at the
>> moment i'm getting the following error:
>> 
>>        Caused by: java.lang.RuntimeException:
>> org.apache.solr.handler.dataimport.DataImportHandlerException:
>> java.lang.ClassCastException:
>> sun.net.www.protocol.http.HttpURLConnection$HttpInputStream cannot be cast
>> to java.io.Reader
>> 
>> 
>> 
>> 
>> 
>> On 28. Sep 2013, at 1:39 AM, Andreas Owen wrote:
>> 
>>> ok i see what you're getting at but why doesn't the following work:
>>> 
>>>      <field xpath="//h:h1" column="h_1" />
>>>      <field column="text" xpath="/xhtml:html/xhtml:body" />
>>> 
>>> i removed the tika-processor. what am i missing, i haven't found
>> anything in the wiki?
>>> 
>>> 
>>> On 28. Sep 2013, at 12:28 AM, P Williams wrote:
>>> 
>>>> I spent some more time thinking about this.  Do you really need to use
>> the
>>>> TikaEntityProcessor?  It doesn't offer anything new to the document you
>> are
>>>> building that couldn't be accomplished by the XPathEntityProcessor alone
>>>> from what I can tell.
>>>> 
>>>> I also tried to get the Advanced
>>>> Parsing<http://wiki.apache.org/solr/TikaEntityProcessor> example to
>>>> work without success.  There are some obvious typos (<document>
>>>> instead of </document>) and an odd order to the pieces (<dataSources> is
>>>> enclosed by <document>).  It also looks like
>>>> FieldStreamDataSource<
>> http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html
>>> is
>>>> the one that is meant to work in this context. If Koji is still around
>>>> maybe he could offer some help?  Otherwise this bit of erroneous
>>>> instruction should probably be removed from the wiki.
>>>> 
>>>> Cheers,
>>>> Tricia
>>>> 
>>>> $ svn diff
>>>> Index:
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>>>> ===================================================================
>>>> ---
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>>>>   (revision 1526990)
>>>> +++
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>>>>   (working copy)
>>>> @@ -99,13 +99,13 @@
>>>>   runFullImport(getConfigHTML("identity"));
>>>>   assertQ(req("*:*"), testsHTMLIdentity);
>>>> }
>>>> -
>>>> +
>>>> private String getConfigHTML(String htmlMapper) {
>>>>   return
>>>>       "<dataConfig>" +
>>>>           "  <dataSource type='BinFileDataSource'/>" +
>>>>           "  <document>" +
>>>> -            "    <entity name='Tika' format='xml'
>>>> processor='TikaEntityProcessor' " +
>>>> +            "    <entity name='Tika' format='html'
>>>> processor='TikaEntityProcessor' " +
>>>>           "       url='" +
>>>> getFile("dihextras/structured.html").getAbsolutePath() + "' " +
>>>>           ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
>>>> "'")) + ">" +
>>>>           "      <field column='text'/>" +
>>>> @@ -114,4 +114,36 @@
>>>>           "</dataConfig>";
>>>> 
>>>> }
>>>> +  private String[] testsHTMLH1 = {
>>>> +      "//*[@numFound='1']"
>>>> +      , "//str[@name='h1'][contains(.,'H1 Header')]"
>>>> +  };
>>>> +
>>>> +  @Test
>>>> +  public void testTikaHTMLMapperSubEntity() throws Exception {
>>>> +    runFullImport(getConfigSubEntity("identity"));
>>>> +    assertQ(req("*:*"), testsHTMLH1);
>>>> +  }
>>>> +
>>>> +  private String getConfigSubEntity(String htmlMapper) {
>>>> +    return
>>>> +        "<dataConfig>" +
>>>> +        "<dataSource type='BinFileDataSource' name='bin'/>" +
>>>> +        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
>>>> +        "<document>" +
>>>> +        "<entity name='tika' processor='TikaEntityProcessor' url='" +
>>>> getFile("dihextras/structured.html").getAbsolutePath() + "'
>>>> dataSource='bin' format='html' rootEntity='false'>" +
>>>> +        "<!--Do appropriate mapping here  meta=\"true\" means it is a
>>>> metadata field -->" +
>>>> +        "<field column='Author' meta='true' name='author'/>" +
>>>> +        "<field column='title' meta='true' name='title'/>" +
>>>> +        "<!--'text' is an implicit field emited by TikaEntityProcessor
>> .
>>>> Map it appropriately-->" +
>>>> +        "<field name='text' column='text'/>" +
>>>> +        "<entity name='detail' type='XPathEntityProcessor'
>> forEach='/html'
>>>> dataSource='fld' dataField='tika.text' rootEntity='true' >" +
>>>> +        "<field xpath='//div'  column='foo'/>" +
>>>> +        "<field xpath='//h1'  column='h1' />" +
>>>> +        "</entity>" +
>>>> +        "</entity>" +
>>>> +        "</document>" +
>>>> +        "</dataConfig>";
>>>> +  }
>>>> +
>>>> }
>>>> Index:
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>>>> ===================================================================
>>>> ---
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>>>> (revision 1526990)
>>>> +++
>>>> 
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>>>> (working copy)
>>>> @@ -194,6 +194,8 @@
>>>>  <field name="title" type="string" indexed="true" stored="true"/>
>>>>  <field name="author" type="string" indexed="true" stored="true" />
>>>>  <field name="text" type="text" indexed="true" stored="true" />
>>>> +   <field name="h1" type="text" indexed="true" stored="true" />
>>>> +   <field name="foo" type="text" indexed="true" stored="true" />
>>>> 
>>>> </fields>
>>>> <!-- field for the QueryParser to use when an explicit fieldname is
>>>> absent -->
>>>> 
>>>> 
>>>> I find the SqlEntityProcessor part particularly odd.  That's the default
>>>> right?:
>>>> 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
>>>> 'null' java.lang.RuntimeException: unsupported type : class
>> java.lang.String
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>> at
>>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
>>>> at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
>>>> at
>>>> 
>> org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
>>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>>> at
>>>> 
>> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>>> at
>>>> 
>> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>>> at java.lang.reflect.Method.invoke(Method.java:601)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
>>>> at
>>>> 
>> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
>>>> at
>>>> 
>> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
>>>> at
>>>> 
>> org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>>>> at
>>>> 
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
>>>> at java.lang.Thread.run(Thread.java:722)
>>>> 
>>>> 
>>>> 
>>>> On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <ao...@conx.ch> wrote:
>>>> 
>>>>> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
>>>>> help. i get the following for each document:
>>>>>      DataImportHandlerException: Exception in invoking url null
>>>>> Processing Document # 9
>>>>>      nullpointerexception
>>>>> 
>>>>> 
>>>>> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
>>>>> 
>>>>>> Hi,
>>>>>> 
>>>>>> Haven't tried this myself but maybe try leaving out the
>>>>>> FieldReaderDataSource entirely.  From my quick searching looks like
>> it's
>>>>>> tied to SQL.  Did you try copying the
>>>>>> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing
>> example
>>>>>> exactly?  What happens when you leave out FieldReaderDataSource?
>>>>>> 
>>>>>> Cheers,
>>>>>> Tricia
>>>>>> 
>>>>>> 
>>>>>> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
>>>>>> 
>>>>>>> i'm using solr 4.3.1 and the dataimporter. i am trying to use
>>>>>>> XPathEntityProcessor within the TikaEntityProcessor for indexing
>>>>> html-pages
>>>>>>> but i'm getting this error for each document. i have also tried
>>>>>>> dataField="tika.text" and dataField="text" to no avail. the nested
>>>>>>> XPathEntityProcessor "detail" creates the error, the rest works fine.
>>>>> what
>>>>>>> am i doing wrong?
>>>>>>> 
>>>>>>> error:
>>>>>>> 
>>>>>>> ERROR - 2013-09-26 12:08:49.006;
>>>>>>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query
>> failed
>>>>>>> 'null'
>>>>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>>>>> java.util.Iterator
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>>>>     at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>>>>     at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>>>>     at
>>>>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>>>>     at
>>>>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>>>>     at java.lang.Thread.run(Unknown Source)
>>>>>>> ERROR - 2013-09-26 12:08:49.022;
>> org.apache.solr.common.SolrException;
>>>>>>> Exception in entity :
>>>>>>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
>>>>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>>>>> java.util.Iterator
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>>>>     at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>>>>     at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>>>>     at
>>>>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>>>>     at
>>>>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>>>>     at java.lang.Thread.run(Unknown Source)
>>>>>>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot
>> be
>>>>>>> cast to java.util.Iterator
>>>>>>>     at
>>>>>>> 
>>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>>>>     ... 41 more
>>>>>>> 
>>>>>>> 
>>>>>>> 
>>>>>>> data-config.xml
>>>>>>> 
>>>>>>> <dataConfig>
>>>>>>>     <dataSource type="BinURLDataSource" name="dataFile"/>
>>>>>>>     <dataSource type="BinURLDataSource" name="dataUrl"/>
>>>>>>>     <dataSource type="URLDataSource" name="main"/>
>>>>>>>     <dataSource type="FieldReaderDataSource" name="fld"/>
>>>>>>> <document>
>>>>>>> <entity name="rec" processor="XPathEntityProcessor"
>>>>>>> 
>>>>> 
>> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
>>>>>>> forEach="/docs/doc" dataSource="main">
>>>>>>>             <field column="title" xpath="//title" />
>>>>>>>             <field column="id" xpath="//id" />
>>>>>>>             <field column="file" xpath="//file" />
>>>>>>>             <field column="url" xpath="//url" />
>>>>>>>             <field column="urlParse" xpath="//urlParse" />
>>>>>>>             <field column="last_modified" xpath="//last_modified" />
>>>>>>>             <field column="Author" xpath="//author" />
>>>>>>> 
>>>>>>>             <entity name="tika" processor="TikaEntityProcessor"
>>>>>>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip"
>> format="html">
>>>>>>>                     <field column="text"/>
>>>>>>> 
>>>>>>>                     <entity name="detail"
>> type="XPathEntityProcessor"
>>>>>>> forEach="/html" dataSource="fld" dataField="${tika.text}"
>>>>> rootEntity="true"
>>>>>>> onError="skip">
>>>>>>>                             <field xpath="//h1" column="h_1" />
>>>>>>>                     </entity>
>>>>>>>             </entity>
>>>>>>>     </entity>
>>>>>>> </document>
>>>>>>> </dataConfig>
>>>>> 
>>>>> 
>> 
>> 


Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by P Williams <wi...@gmail.com>.
Hi Andreas,

When using XPathEntityProcessor<http://wiki.apache.org/solr/DataImportHandler#XPathEntityProcessor>, your
DataSource
must be of type DataSource<Reader>.  You shouldn't be using
BinURLDataSource; it's giving you the cast exception.  Use
URLDataSource<https://builds.apache.org/job/Solr-Artifacts-4.x/javadoc/solr-dataimporthandler/org/apache/solr/handler/dataimport/URLDataSource.html>
or
FileDataSource<https://builds.apache.org/job/Solr-Artifacts-4.x/javadoc/solr-dataimporthandler/org/apache/solr/handler/dataimport/FileDataSource.html> instead.

I don't think you need to specify namespaces, at least you didn't use to.
 The other thing that I've noticed is that the "anywhere" XPath expression (//)
doesn't always work in DIH.  You might have to be more specific.

Cheers,
Tricia





On Sun, Sep 29, 2013 at 9:47 AM, Andreas Owen <ao...@conx.ch> wrote:

> how dumb can you get. obviously quite dumb... i would have to analyze the
> html-pages with a nested instance like this:
>
> <entity name="rec" processor="XPathEntityProcessor"
> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
> forEach="/docs/doc" dataSource="main">
>
>                 <entity name="htm" processor="XPathEntityProcessor"
> url="${rec.urlParse}" forEach="/xhtml:html" dataSource="dataUrl">
>                         <field column="text" xpath="//content" />
>                         <field column="h_2" xpath="//body" />
>                         <field column="text_nohtml" xpath="//text" />
>                         <field column="h_1" xpath="//h:h1" />
>                 </entity>
> </entity>
>
> but i'm pretty sure the forEach and the xpath expressions are wrong. at the
> moment i'm getting the following error:
>
>         Caused by: java.lang.RuntimeException:
> org.apache.solr.handler.dataimport.DataImportHandlerException:
> java.lang.ClassCastException:
> sun.net.www.protocol.http.HttpURLConnection$HttpInputStream cannot be cast
> to java.io.Reader
>
>
>
>
>
> On 28. Sep 2013, at 1:39 AM, Andreas Owen wrote:
>
> > ok i see what you're getting at but why doesn't the following work:
> >
> >       <field xpath="//h:h1" column="h_1" />
> >       <field column="text" xpath="/xhtml:html/xhtml:body" />
> >
> > i removed the tika-processor. what am i missing? i haven't found
> anything in the wiki.
> >
> >
> > On 28. Sep 2013, at 12:28 AM, P Williams wrote:
> >
> >> I spent some more time thinking about this.  Do you really need to use
> the
> >> TikaEntityProcessor?  It doesn't offer anything new to the document you
> are
> >> building that couldn't be accomplished by the XPathEntityProcessor alone
> >> from what I can tell.
> >>
> >> I also tried to get the Advanced
> >> Parsing<http://wiki.apache.org/solr/TikaEntityProcessor> example to
> >> work without success.  There are some obvious typos (<document>
> >> instead of </document>) and an odd order to the pieces (<dataSources> is
> >> enclosed by <document>).  It also looks like
> >> FieldStreamDataSource<
> http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html
> >is
> >> the one that is meant to work in this context. If Koji is still around
> >> maybe he could offer some help?  Otherwise this bit of erroneous
> >> instruction should probably be removed from the wiki.
> >>
> >> Cheers,
> >> Tricia
> >>
> >> $ svn diff
> >> Index:
> >>
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
> >> ===================================================================
> >> ---
> >>
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
> >>    (revision 1526990)
> >> +++
> >>
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
> >>    (working copy)
> >> @@ -99,13 +99,13 @@
> >>    runFullImport(getConfigHTML("identity"));
> >>    assertQ(req("*:*"), testsHTMLIdentity);
> >>  }
> >> -
> >> +
> >>  private String getConfigHTML(String htmlMapper) {
> >>    return
> >>        "<dataConfig>" +
> >>            "  <dataSource type='BinFileDataSource'/>" +
> >>            "  <document>" +
> >> -            "    <entity name='Tika' format='xml'
> >> processor='TikaEntityProcessor' " +
> >> +            "    <entity name='Tika' format='html'
> >> processor='TikaEntityProcessor' " +
> >>            "       url='" +
> >> getFile("dihextras/structured.html").getAbsolutePath() + "' " +
> >>            ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
> >> "'")) + ">" +
> >>            "      <field column='text'/>" +
> >> @@ -114,4 +114,36 @@
> >>            "</dataConfig>";
> >>
> >>  }
> >> +  private String[] testsHTMLH1 = {
> >> +      "//*[@numFound='1']"
> >> +      , "//str[@name='h1'][contains(.,'H1 Header')]"
> >> +  };
> >> +
> >> +  @Test
> >> +  public void testTikaHTMLMapperSubEntity() throws Exception {
> >> +    runFullImport(getConfigSubEntity("identity"));
> >> +    assertQ(req("*:*"), testsHTMLH1);
> >> +  }
> >> +
> >> +  private String getConfigSubEntity(String htmlMapper) {
> >> +    return
> >> +        "<dataConfig>" +
> >> +        "<dataSource type='BinFileDataSource' name='bin'/>" +
> >> +        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
> >> +        "<document>" +
> >> +        "<entity name='tika' processor='TikaEntityProcessor' url='" +
> >> getFile("dihextras/structured.html").getAbsolutePath() + "'
> >> dataSource='bin' format='html' rootEntity='false'>" +
> >> +        "<!--Do appropriate mapping here  meta=\"true\" means it is a
> >> metadata field -->" +
> >> +        "<field column='Author' meta='true' name='author'/>" +
> >> +        "<field column='title' meta='true' name='title'/>" +
> >> +        "<!--'text' is an implicit field emited by TikaEntityProcessor
> .
> >> Map it appropriately-->" +
> >> +        "<field name='text' column='text'/>" +
> >> +        "<entity name='detail' type='XPathEntityProcessor'
> forEach='/html'
> >> dataSource='fld' dataField='tika.text' rootEntity='true' >" +
> >> +        "<field xpath='//div'  column='foo'/>" +
> >> +        "<field xpath='//h1'  column='h1' />" +
> >> +        "</entity>" +
> >> +        "</entity>" +
> >> +        "</document>" +
> >> +        "</dataConfig>";
> >> +  }
> >> +
> >> }
> >> Index:
> >>
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
> >> ===================================================================
> >> ---
> >>
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
> >>  (revision 1526990)
> >> +++
> >>
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
> >>  (working copy)
> >> @@ -194,6 +194,8 @@
> >>   <field name="title" type="string" indexed="true" stored="true"/>
> >>   <field name="author" type="string" indexed="true" stored="true" />
> >>   <field name="text" type="text" indexed="true" stored="true" />
> >> +   <field name="h1" type="text" indexed="true" stored="true" />
> >> +   <field name="foo" type="text" indexed="true" stored="true" />
> >>
> >> </fields>
> >> <!-- field for the QueryParser to use when an explicit fieldname is
> >> absent -->
> >>
> >>
> >> I find the SqlEntityProcessor part particularly odd.  That's the default
> >> right?:
> >> 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
> >> 'null' java.lang.RuntimeException: unsupported type : class
> java.lang.String
> >> at
> >>
> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
> >> at
> >>
> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
> >> at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >> at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >> at
> >>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >> at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
> >> at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
> >> at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
> >> at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
> >> at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
> >> at
> >>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
> >> at
> >>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
> >> at
> >>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >> at
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
> >> at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
> >> at
> >>
> org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
> >> at
> >>
> org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
> >> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> >> at
> >>
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> >> at
> >>
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> >> at java.lang.reflect.Method.invoke(Method.java:601)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
> >> at
> >>
> org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
> >> at
> >>
> org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
> >> at
> >>
> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
> >> at
> >>
> org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
> >> at
> >>
> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
> >> at
> >>
> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> >> at
> >>
> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
> >> at
> >>
> com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
> >> at
> >>
> com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
> >> at
> >>
> com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
> >> at
> >>
> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
> >> at
> >>
> org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> >> at
> >>
> org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
> >> at
> >>
> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
> >> at
> >>
> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
> >> at
> >>
> org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
> >> at
> >>
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> >> at
> >>
> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
> >> at java.lang.Thread.run(Thread.java:722)
> >>
> >>
> >>
> >> On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <ao...@conx.ch> wrote:
> >>
> >>> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
> >>> help. i get the following for each document:
> >>>       DataImportHandlerException: Exception in invoking url null
> >>> Processing Document # 9
> >>>       nullpointerexception
> >>>
> >>>
> >>> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
> >>>
> >>>> Hi,
> >>>>
> >>>> Haven't tried this myself but maybe try leaving out the
> >>>> FieldReaderDataSource entirely.  From my quick searching looks like
> it's
> >>>> tied to SQL.  Did you try copying the
> >>>> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing
> example
> >>>> exactly?  What happens when you leave out FieldReaderDataSource?
> >>>>
> >>>> Cheers,
> >>>> Tricia
> >>>>
> >>>>
> >>>> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
> >>>>
> >>>>> i'm using solr 4.3.1 and the dataimporter. i am trying to use
> >>>>> XPathEntityProcessor within the TikaEntityProcessor for indexing
> >>> html-pages
> >>>>> but i'm getting this error for each document. i have also tried
> >>>>> dataField="tika.text" and dataField="text" to no avail. the nested
> >>>>> XPathEntityProcessor "detail" creates the error, the rest works fine.
> >>> what
> >>>>> am i doing wrong?
> >>>>>
> >>>>> error:
> >>>>>
> >>>>> ERROR - 2013-09-26 12:08:49.006;
> >>>>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query
> failed
> >>>>> 'null'
> >>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >>>>> java.util.Iterator
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>>>>      at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>>>>      at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>>>>      at
> >>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>>>>      at
> >>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>>>>      at java.lang.Thread.run(Unknown Source)
> >>>>> ERROR - 2013-09-26 12:08:49.022;
> org.apache.solr.common.SolrException;
> >>>>> Exception in entity :
> >>>>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
> >>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >>>>> java.util.Iterator
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>>>>      at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>>>>      at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>>>>      at
> >>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>>>>      at
> >>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>>>>      at
> >>>>>
> >>>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>>>>      at java.lang.Thread.run(Unknown Source)
> >>>>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot
> be
> >>>>> cast to java.util.Iterator
> >>>>>      at
> >>>>>
> >>>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>>>>      ... 41 more
> >>>>>
> >>>>>
> >>>>>
> >>>>> data-config.xml
> >>>>>
> >>>>> <dataConfig>
> >>>>>      <dataSource type="BinURLDataSource" name="dataFile"/>
> >>>>>      <dataSource type="BinURLDataSource" name="dataUrl"/>
> >>>>>      <dataSource type="URLDataSource" name="main"/>
> >>>>>      <dataSource type="FieldReaderDataSource" name="fld"/>
> >>>>> <document>
> >>>>> <entity name="rec" processor="XPathEntityProcessor"
> >>>>>
> >>>
> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
> >>>>> forEach="/docs/doc" dataSource="main">
> >>>>>              <field column="title" xpath="//title" />
> >>>>>              <field column="id" xpath="//id" />
> >>>>>              <field column="file" xpath="//file" />
> >>>>>              <field column="url" xpath="//url" />
> >>>>>              <field column="urlParse" xpath="//urlParse" />
> >>>>>              <field column="last_modified" xpath="//last_modified" />
> >>>>>              <field column="Author" xpath="//author" />
> >>>>>
> >>>>>              <entity name="tika" processor="TikaEntityProcessor"
> >>>>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip"
> format="html">
> >>>>>                      <field column="text"/>
> >>>>>
> >>>>>                      <entity name="detail"
> type="XPathEntityProcessor"
> >>>>> forEach="/html" dataSource="fld" dataField="${tika.text}"
> >>> rootEntity="true"
> >>>>> onError="skip">
> >>>>>                              <field xpath="//h1" column="h_1" />
> >>>>>                      </entity>
> >>>>>              </entity>
> >>>>>      </entity>
> >>>>> </document>
> >>>>> </dataConfig>
> >>>
> >>>
>
>

Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Andreas Owen <ao...@conx.ch>.
how dumb can you get. obviously quite dumb... i would have to analyze the html-pages with a nested instance like this:

<entity name="rec" processor="XPathEntityProcessor" url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml" forEach="/docs/doc" dataSource="main"> 
	
		<entity name="htm" processor="XPathEntityProcessor" url="${rec.urlParse}" forEach="/xhtml:html" dataSource="dataUrl">
			<field column="text" xpath="//content" />
			<field column="h_2" xpath="//body" />
			<field column="text_nohtml" xpath="//text" />
			<field column="h_1" xpath="//h:h1" />
		</entity>
</entity>

but i'm pretty sure the forEach and the xpath expressions are wrong. at the moment i'm getting the following error:
	
	Caused by: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: java.lang.ClassCastException: sun.net.www.protocol.http.HttpURLConnection$HttpInputStream cannot be cast to java.io.Reader





On 28. Sep 2013, at 1:39 AM, Andreas Owen wrote:

> ok i see what you're getting at but why doesn't the following work:
> 	
> 	<field xpath="//h:h1" column="h_1" />
> 	<field column="text" xpath="/xhtml:html/xhtml:body" />
> 
> i removed the tika-processor. what am i missing? i haven't found anything in the wiki.
> 
> 
> On 28. Sep 2013, at 12:28 AM, P Williams wrote:
> 
>> I spent some more time thinking about this.  Do you really need to use the
>> TikaEntityProcessor?  It doesn't offer anything new to the document you are
>> building that couldn't be accomplished by the XPathEntityProcessor alone
>> from what I can tell.
>> 
>> I also tried to get the Advanced
>> Parsing <http://wiki.apache.org/solr/TikaEntityProcessor> example to
>> work without success.  There are some obvious typos (<document>
>> instead of </document>) and an odd order to the pieces (<dataSources> is
>> enclosed by <document>).  It also looks like
>> FieldStreamDataSource <http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html> is
>> the one that is meant to work in this context. If Koji is still around
>> maybe he could offer some help?  Otherwise this bit of erroneous
>> instruction should probably be removed from the wiki.
>> 
>> Cheers,
>> Tricia
>> 
>> $ svn diff
>> Index:
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>> ===================================================================
>> ---
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>>    (revision 1526990)
>> +++
>> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>>    (working copy)
>> @@ -99,13 +99,13 @@
>>    runFullImport(getConfigHTML("identity"));
>>    assertQ(req("*:*"), testsHTMLIdentity);
>>  }
>> -
>> +
>>  private String getConfigHTML(String htmlMapper) {
>>    return
>>        "<dataConfig>" +
>>            "  <dataSource type='BinFileDataSource'/>" +
>>            "  <document>" +
>> -            "    <entity name='Tika' format='xml'
>> processor='TikaEntityProcessor' " +
>> +            "    <entity name='Tika' format='html'
>> processor='TikaEntityProcessor' " +
>>            "       url='" +
>> getFile("dihextras/structured.html").getAbsolutePath() + "' " +
>>            ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
>> "'")) + ">" +
>>            "      <field column='text'/>" +
>> @@ -114,4 +114,36 @@
>>            "</dataConfig>";
>> 
>>  }
>> +  private String[] testsHTMLH1 = {
>> +      "//*[@numFound='1']"
>> +      , "//str[@name='h1'][contains(.,'H1 Header')]"
>> +  };
>> +
>> +  @Test
>> +  public void testTikaHTMLMapperSubEntity() throws Exception {
>> +    runFullImport(getConfigSubEntity("identity"));
>> +    assertQ(req("*:*"), testsHTMLH1);
>> +  }
>> +
>> +  private String getConfigSubEntity(String htmlMapper) {
>> +    return
>> +        "<dataConfig>" +
>> +        "<dataSource type='BinFileDataSource' name='bin'/>" +
>> +        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
>> +        "<document>" +
>> +        "<entity name='tika' processor='TikaEntityProcessor' url='" +
>> getFile("dihextras/structured.html").getAbsolutePath() + "'
>> dataSource='bin' format='html' rootEntity='false'>" +
>> +        "<!--Do appropriate mapping here  meta=\"true\" means it is a
>> metadata field -->" +
>> +        "<field column='Author' meta='true' name='author'/>" +
>> +        "<field column='title' meta='true' name='title'/>" +
>> +        "<!--'text' is an implicit field emited by TikaEntityProcessor .
>> Map it appropriately-->" +
>> +        "<field name='text' column='text'/>" +
>> +        "<entity name='detail' type='XPathEntityProcessor' forEach='/html'
>> dataSource='fld' dataField='tika.text' rootEntity='true' >" +
>> +        "<field xpath='//div'  column='foo'/>" +
>> +        "<field xpath='//h1'  column='h1' />" +
>> +        "</entity>" +
>> +        "</entity>" +
>> +        "</document>" +
>> +        "</dataConfig>";
>> +  }
>> +
>> }
>> Index:
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>> ===================================================================
>> ---
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>>  (revision 1526990)
>> +++
>> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>>  (working copy)
>> @@ -194,6 +194,8 @@
>>   <field name="title" type="string" indexed="true" stored="true"/>
>>   <field name="author" type="string" indexed="true" stored="true" />
>>   <field name="text" type="text" indexed="true" stored="true" />
>> +   <field name="h1" type="text" indexed="true" stored="true" />
>> +   <field name="foo" type="text" indexed="true" stored="true" />
>> 
>> </fields>
>> <!-- field for the QueryParser to use when an explicit fieldname is
>> absent -->
>> 
>> 
>> I find the SqlEntityProcessor part particularly odd.  That's the default
>> right?:
>> 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
>> 'null' java.lang.RuntimeException: unsupported type : class java.lang.String
>> at
>> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
>> at
>> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
>> at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>> at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>> at
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>> at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
>> at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
>> at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
>> at
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
>> at
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
>> at
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
>> at
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
>> at
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>> at
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
>> at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
>> at
>> org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
>> at
>> org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at
>> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>> at
>> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:601)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
>> at
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
>> at
>> org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
>> at
>> org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
>> at
>> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
>> at
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
>> at
>> org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
>> at
>> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
>> at
>> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
>> at
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>> at
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
>> at
>> com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
>> at
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
>> at
>> com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
>> at
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>> at
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
>> at
>> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
>> at
>> org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
>> at
>> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
>> at
>> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
>> at
>> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
>> at
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>> at
>> org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
>> at
>> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
>> at
>> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
>> at
>> org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
>> at
>> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
>> at
>> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
>> at java.lang.Thread.run(Thread.java:722)
>> 
>> 
>> 
>> On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <ao...@conx.ch> wrote:
>> 
>>> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
>>> help. i get the following for each document:
>>>       DataImportHandlerException: Exception in invoking url null
>>> Processing Document # 9
>>>       nullpointerexception
>>> 
>>> 
>>> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
>>> 
>>>> Hi,
>>>> 
>>>> Haven't tried this myself but maybe try leaving out the
>>>> FieldReaderDataSource entirely.  From my quick searching looks like it's
>>>> tied to SQL.  Did you try copying the
>>>> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
>>>> exactly?  What happens when you leave out FieldReaderDataSource?
>>>> 
>>>> Cheers,
>>>> Tricia
>>>> 
>>>> 
>>>> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
>>>> 
>>>>> i'm using solr 4.3.1 and the dataimporter. i am trying to use
>>>>> XPathEntityProcessor within the TikaEntityProcessor for indexing
>>> html-pages
>>>>> but i'm getting this error for each document. i have also tried
>>>>> dataField="tika.text" and dataField="text" to no avail. the nested
>>>>> XPathEntityProcessor "detail" creates the error, the rest works fine.
>>> what
>>>>> am i doing wrong?
>>>>> 
>>>>> error:
>>>>> 
>>>>> ERROR - 2013-09-26 12:08:49.006;
>>>>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
>>>>> 'null'
>>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>>> java.util.Iterator
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>>      at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>>      at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>>      at
>>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>>      at
>>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>>      at java.lang.Thread.run(Unknown Source)
>>>>> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
>>>>> Exception in entity :
>>>>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
>>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>>> java.util.Iterator
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>>      at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>>      at
>>>>> 
>>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>>      at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>>      at
>>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>>      at
>>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>>      at
>>>>> 
>>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>>      at java.lang.Thread.run(Unknown Source)
>>>>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
>>>>> cast to java.util.Iterator
>>>>>      at
>>>>> 
>>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>>      ... 41 more
>>>>> 
>>>>> 
>>>>> 
>>>>> data-config.xml
>>>>> 
>>>>> <dataConfig>
>>>>>      <dataSource type="BinURLDataSource" name="dataFile"/>
>>>>>      <dataSource type="BinURLDataSource" name="dataUrl"/>
>>>>>      <dataSource type="URLDataSource" name="main"/>
>>>>>      <dataSource type="FieldReaderDataSource" name="fld"/>
>>>>> <document>
>>>>> <entity name="rec" processor="XPathEntityProcessor"
>>>>> 
>>> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
>>>>> forEach="/docs/doc" dataSource="main">
>>>>>              <field column="title" xpath="//title" />
>>>>>              <field column="id" xpath="//id" />
>>>>>              <field column="file" xpath="//file" />
>>>>>              <field column="url" xpath="//url" />
>>>>>              <field column="urlParse" xpath="//urlParse" />
>>>>>              <field column="last_modified" xpath="//last_modified" />
>>>>>              <field column="Author" xpath="//author" />
>>>>> 
>>>>>              <entity name="tika" processor="TikaEntityProcessor"
>>>>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>>>>>                      <field column="text"/>
>>>>> 
>>>>>                      <entity name="detail" type="XPathEntityProcessor"
>>>>> forEach="/html" dataSource="fld" dataField="${tika.text}"
>>> rootEntity="true"
>>>>> onError="skip">
>>>>>                              <field xpath="//h1" column="h_1" />
>>>>>                      </entity>
>>>>>              </entity>
>>>>>      </entity>
>>>>> </document>
>>>>> </dataConfig>
>>> 
>>> 


Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Andreas Owen <ao...@conx.ch>.
ok i see what you're getting at but why doesn't the following work:
	
	<field xpath="//h:h1" column="h_1" />
	<field column="text" xpath="/xhtml:html/xhtml:body" />

i removed the tika-processor. what am i missing? i haven't found anything in the wiki.


On 28. Sep 2013, at 12:28 AM, P Williams wrote:

> I spent some more time thinking about this.  Do you really need to use the
> TikaEntityProcessor?  It doesn't offer anything new to the document you are
> building that couldn't be accomplished by the XPathEntityProcessor alone
> from what I can tell.
> 
> I also tried to get the Advanced
> Parsing <http://wiki.apache.org/solr/TikaEntityProcessor> example to
> work without success.  There are some obvious typos (<document>
> instead of </document>) and an odd order to the pieces (<dataSources> is
> enclosed by <document>).  It also looks like
> FieldStreamDataSource <http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html> is
> the one that is meant to work in this context. If Koji is still around
> maybe he could offer some help?  Otherwise this bit of erroneous
> instruction should probably be removed from the wiki.
> 
> Cheers,
> Tricia
> 
> $ svn diff
> Index:
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
> ===================================================================
> ---
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>     (revision 1526990)
> +++
> solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
>     (working copy)
> @@ -99,13 +99,13 @@
>     runFullImport(getConfigHTML("identity"));
>     assertQ(req("*:*"), testsHTMLIdentity);
>   }
> -
> +
>   private String getConfigHTML(String htmlMapper) {
>     return
>         "<dataConfig>" +
>             "  <dataSource type='BinFileDataSource'/>" +
>             "  <document>" +
> -            "    <entity name='Tika' format='xml'
> processor='TikaEntityProcessor' " +
> +            "    <entity name='Tika' format='html'
> processor='TikaEntityProcessor' " +
>             "       url='" +
> getFile("dihextras/structured.html").getAbsolutePath() + "' " +
>             ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
> "'")) + ">" +
>             "      <field column='text'/>" +
> @@ -114,4 +114,36 @@
>             "</dataConfig>";
> 
>   }
> +  private String[] testsHTMLH1 = {
> +      "//*[@numFound='1']"
> +      , "//str[@name='h1'][contains(.,'H1 Header')]"
> +  };
> +
> +  @Test
> +  public void testTikaHTMLMapperSubEntity() throws Exception {
> +    runFullImport(getConfigSubEntity("identity"));
> +    assertQ(req("*:*"), testsHTMLH1);
> +  }
> +
> +  private String getConfigSubEntity(String htmlMapper) {
> +    return
> +        "<dataConfig>" +
> +        "<dataSource type='BinFileDataSource' name='bin'/>" +
> +        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
> +        "<document>" +
> +        "<entity name='tika' processor='TikaEntityProcessor' url='" +
> getFile("dihextras/structured.html").getAbsolutePath() + "'
> dataSource='bin' format='html' rootEntity='false'>" +
> +        "<!--Do appropriate mapping here  meta=\"true\" means it is a
> metadata field -->" +
> +        "<field column='Author' meta='true' name='author'/>" +
> +        "<field column='title' meta='true' name='title'/>" +
> +        "<!--'text' is an implicit field emited by TikaEntityProcessor .
> Map it appropriately-->" +
> +        "<field name='text' column='text'/>" +
> +        "<entity name='detail' type='XPathEntityProcessor' forEach='/html'
> dataSource='fld' dataField='tika.text' rootEntity='true' >" +
> +        "<field xpath='//div'  column='foo'/>" +
> +        "<field xpath='//h1'  column='h1' />" +
> +        "</entity>" +
> +        "</entity>" +
> +        "</document>" +
> +        "</dataConfig>";
> +  }
> +
> }
> Index:
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
> ===================================================================
> ---
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>   (revision 1526990)
> +++
> solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
>   (working copy)
> @@ -194,6 +194,8 @@
>    <field name="title" type="string" indexed="true" stored="true"/>
>    <field name="author" type="string" indexed="true" stored="true" />
>    <field name="text" type="text" indexed="true" stored="true" />
> +   <field name="h1" type="text" indexed="true" stored="true" />
> +   <field name="foo" type="text" indexed="true" stored="true" />
> 
>  </fields>
>  <!-- field for the QueryParser to use when an explicit fieldname is
> absent -->
> 
> 
> I find the SqlEntityProcessor part particularly odd.  That's the default
> right?:
> 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
> 'null' java.lang.RuntimeException: unsupported type : class java.lang.String
> at
> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
> at
> org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
> at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> at
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
> at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
> at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
> at
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
> at
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
> at
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
> at
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
> at
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
> at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
> at
> org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
> at
> org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:601)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
> at
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
> at
> org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
> at
> org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
> at
> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
> at
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
> at
> org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
> at
> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
> at
> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
> at
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> at
> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
> at
> com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
> at
> com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
> at
> com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
> at
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> at
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
> at
> org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
> at
> org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
> at
> com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
> at
> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
> at
> com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
> at
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> at
> org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
> at
> org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
> at
> org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
> at
> org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
> at
> com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
> at
> com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
> at java.lang.Thread.run(Thread.java:722)
> 
> 
> 
> On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <ao...@conx.ch> wrote:
> 
>> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
>> help. i get the following for each document:
>>        DataImportHandlerException: Exception in invoking url null
>> Processing Document # 9
>>        nullpointerexception
>> 
>> 
>> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
>> 
>>> Hi,
>>> 
>>> Haven't tried this myself but maybe try leaving out the
>>> FieldReaderDataSource entirely.  From my quick searching looks like it's
>>> tied to SQL.  Did you try copying the
>>> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
>>> exactly?  What happens when you leave out FieldReaderDataSource?
>>> 
>>> Cheers,
>>> Tricia
>>> 
>>> 
>>> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
>>> 
>>>> i'm using solr 4.3.1 and the dataimporter. i am trying to use
>>>> XPathEntityProcessor within the TikaEntityProcessor for indexing
>> html-pages
>>>> but i'm getting this error for each document. i have also tried
>>>> dataField="tika.text" and dataField="text" to no avail. the nested
>>>> XPathEntityProcessor "detail" creates the error, the rest works fine.
>> what
>>>> am i doing wrong?
>>>> 
>>>> error:
>>>> 
>>>> ERROR - 2013-09-26 12:08:49.006;
>>>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
>>>> 'null'
>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>> java.util.Iterator
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>       at
>>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>       at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>       at
>>>> 
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>       at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>       at
>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>       at
>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>       at
>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>       at
>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>       at java.lang.Thread.run(Unknown Source)
>>>> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
>>>> Exception in entity :
>>>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
>>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>>>> java.util.Iterator
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>>>       at
>>>> 
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>>>       at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>>>       at
>>>> 
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>>>       at
>>>> 
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>>>       at
>>>> 
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>>>       at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>>>       at
>> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>>>       at
>>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>>>       at
>>>> 
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>>>       at
>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>>>       at
>>>> 
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>>>       at java.lang.Thread.run(Unknown Source)
>>>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
>>>> cast to java.util.Iterator
>>>>       at
>>>> 
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>>>       ... 41 more
>>>> 
>>>> 
>>>> 
>>>> data-config.xml
>>>> 
>>>> <dataConfig>
>>>>       <dataSource type="BinURLDataSource" name="dataFile"/>
>>>>       <dataSource type="BinURLDataSource" name="dataUrl"/>
>>>>       <dataSource type="URLDataSource" name="main"/>
>>>>       <dataSource type="FieldReaderDataSource" name="fld"/>
>>>> <document>
>>>> <entity name="rec" processor="XPathEntityProcessor"
>>>> 
>> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
>>>> forEach="/docs/doc" dataSource="main">
>>>>               <field column="title" xpath="//title" />
>>>>               <field column="id" xpath="//id" />
>>>>               <field column="file" xpath="//file" />
>>>>               <field column="url" xpath="//url" />
>>>>               <field column="urlParse" xpath="//urlParse" />
>>>>               <field column="last_modified" xpath="//last_modified" />
>>>>               <field column="Author" xpath="//author" />
>>>> 
>>>>               <entity name="tika" processor="TikaEntityProcessor"
>>>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>>>>                       <field column="text"/>
>>>> 
>>>>                       <entity name="detail" type="XPathEntityProcessor"
>>>> forEach="/html" dataSource="fld" dataField="${tika.text}"
>> rootEntity="true"
>>>> onError="skip">
>>>>                               <field xpath="//h1" column="h_1" />
>>>>                       </entity>
>>>>               </entity>
>>>>       </entity>
>>>> </document>
>>>> </dataConfig>
>> 
>> 


Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by P Williams <wi...@gmail.com>.
I spent some more time thinking about this.  Do you really need to use the
TikaEntityProcessor?  It doesn't offer anything new to the document you are
building that couldn't be accomplished by the XPathEntityProcessor alone
from what I can tell.

I also tried to get the Advanced Parsing
<http://wiki.apache.org/solr/TikaEntityProcessor> example to
work without success.  There are some obvious typos (<document>
instead of </document>) and an odd order to the pieces (<dataSources> is
enclosed by <document>).  It also looks like FieldStreamDataSource
<http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html>
is the one that is meant to work in this context. If Koji is still around
maybe he could offer some help?  Otherwise this bit of erroneous
instruction should probably be removed from the wiki.

Cheers,
Tricia

$ svn diff
Index:
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
===================================================================
---
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
     (revision 1526990)
+++
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
     (working copy)
@@ -99,13 +99,13 @@
     runFullImport(getConfigHTML("identity"));
     assertQ(req("*:*"), testsHTMLIdentity);
   }
-
+
   private String getConfigHTML(String htmlMapper) {
     return
         "<dataConfig>" +
             "  <dataSource type='BinFileDataSource'/>" +
             "  <document>" +
-            "    <entity name='Tika' format='xml'
processor='TikaEntityProcessor' " +
+            "    <entity name='Tika' format='html'
processor='TikaEntityProcessor' " +
             "       url='" +
getFile("dihextras/structured.html").getAbsolutePath() + "' " +
             ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
"'")) + ">" +
             "      <field column='text'/>" +
@@ -114,4 +114,36 @@
             "</dataConfig>";

   }
+  private String[] testsHTMLH1 = {
+      "//*[@numFound='1']"
+      , "//str[@name='h1'][contains(.,'H1 Header')]"
+  };
+
+  @Test
+  public void testTikaHTMLMapperSubEntity() throws Exception {
+    runFullImport(getConfigSubEntity("identity"));
+    assertQ(req("*:*"), testsHTMLH1);
+  }
+
+  private String getConfigSubEntity(String htmlMapper) {
+    return
+        "<dataConfig>" +
+        "<dataSource type='BinFileDataSource' name='bin'/>" +
+        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
+        "<document>" +
+        "<entity name='tika' processor='TikaEntityProcessor' url='" +
getFile("dihextras/structured.html").getAbsolutePath() + "'
dataSource='bin' format='html' rootEntity='false'>" +
+        "<!--Do appropriate mapping here  meta=\"true\" means it is a
metadata field -->" +
+        "<field column='Author' meta='true' name='author'/>" +
+        "<field column='title' meta='true' name='title'/>" +
+        "<!--'text' is an implicit field emited by TikaEntityProcessor .
Map it appropriately-->" +
+        "<field name='text' column='text'/>" +
+        "<entity name='detail' type='XPathEntityProcessor' forEach='/html'
dataSource='fld' dataField='tika.text' rootEntity='true' >" +
+        "<field xpath='//div'  column='foo'/>" +
+        "<field xpath='//h1'  column='h1' />" +
+        "</entity>" +
+        "</entity>" +
+        "</document>" +
+        "</dataConfig>";
+  }
+
 }
Index:
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
===================================================================
---
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
   (revision 1526990)
+++
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
   (working copy)
@@ -194,6 +194,8 @@
    <field name="title" type="string" indexed="true" stored="true"/>
    <field name="author" type="string" indexed="true" stored="true" />
    <field name="text" type="text" indexed="true" stored="true" />
+   <field name="h1" type="text" indexed="true" stored="true" />
+   <field name="foo" type="text" indexed="true" stored="true" />

  </fields>
  <!-- field for the QueryParser to use when an explicit fieldname is
absent -->


I find the SqlEntityProcessor part particularly odd.  That's the default,
right?  Here is the trace:
2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
'null' java.lang.RuntimeException: unsupported type : class java.lang.String
at
org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
 at
org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
at
org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
 at
org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
 at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
 at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
at
org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
 at
org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
at
org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
 at
org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
at
org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
 at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
 at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
at
org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
 at
org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
 at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:601)
at
com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
at
org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
 at
org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
at
org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
at
org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
 at
org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
at
org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
 at
com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
 at
org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
at
org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
at
com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
 at
com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
 at
org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
at
org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
 at
org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
at
org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
 at java.lang.Thread.run(Thread.java:722)



On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <ao...@conx.ch> wrote:

> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
> help. i get the following for each document:
>         DataImportHandlerException: Exception in invoking url null
> Processing Document # 9
>         nullpointerexception
>
>
> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
>
> > Hi,
> >
> > Haven't tried this myself but maybe try leaving out the
> > FieldReaderDataSource entirely.  From my quick searching looks like it's
> > tied to SQL.  Did you try copying the
> > http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
> > exactly?  What happens when you leave out FieldReaderDataSource?
> >
> > Cheers,
> > Tricia
> >
> >
> > On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
> >
> >> i'm using solr 4.3.1 and the dataimporter. i am trying to use
> >> XPathEntityProcessor within the TikaEntityProcessor for indexing html-pages
> >> but i'm getting this error for each document. i have also tried
> >> dataField="tika.text" and dataField="text" to no avail. the nested
> >> XPathEntityProcessor "detail" creates the error, the rest works fine. what
> >> am i doing wrong?
> >>
> >> error:
> >>
> >> ERROR - 2013-09-26 12:08:49.006;
> >> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
> >> 'null'
> >> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >> java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>        at
> >>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>        at
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>        at
> >>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>        at
> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>        at
> >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>        at
> >>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>        at java.lang.Thread.run(Unknown Source)
> >> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
> >> Exception in entity :
> >> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
> >> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >> java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>        at
> >>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>        at
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>        at
> >>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>        at
> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>        at
> >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>        at
> >>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>        at java.lang.Thread.run(Unknown Source)
> >> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
> >> cast to java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>        ... 41 more
> >>
> >>
> >>
> >> data-config.xml
> >>
> >> <dataConfig>
> >>        <dataSource type="BinURLDataSource" name="dataFile"/>
> >>        <dataSource type="BinURLDataSource" name="dataUrl"/>
> >>        <dataSource type="URLDataSource" name="main"/>
> >>        <dataSource type="FieldReaderDataSource" name="fld"/>
> >> <document>
> >> <entity name="rec" processor="XPathEntityProcessor"
> >>
> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
> >> forEach="/docs/doc" dataSource="main">
> >>                <field column="title" xpath="//title" />
> >>                <field column="id" xpath="//id" />
> >>                <field column="file" xpath="//file" />
> >>                <field column="url" xpath="//url" />
> >>                <field column="urlParse" xpath="//urlParse" />
> >>                <field column="last_modified" xpath="//last_modified" />
> >>                <field column="Author" xpath="//author" />
> >>
> >>                <entity name="tika" processor="TikaEntityProcessor"
> >> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
> >>                        <field column="text"/>
> >>
> >>                        <entity name="detail" type="XPathEntityProcessor"
> >> forEach="/html" dataSource="fld" dataField="${tika.text}"
> rootEntity="true"
> >> onError="skip">
> >>                                <field xpath="//h1" column="h_1" />
> >>                        </entity>
> >>                </entity>
> >>        </entity>
> >> </document>
> >> </dataConfig>
>
>

Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by Andreas Owen <ao...@conx.ch>.
I removed the FieldReaderDataSource and dataSource="fld", but it didn't help. I get the following for each document:
	DataImportHandlerException: Exception in invoking url null Processing Document # 9
	NullPointerException


On 26. Sep 2013, at 8:39 PM, P Williams wrote:

> Hi,
> 
> Haven't tried this myself but maybe try leaving out the
> FieldReaderDataSource entirely.  From my quick searching looks like it's
> tied to SQL.  Did you try copying the
> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
> exactly?  What happens when you leave out FieldReaderDataSource?
> 
> Cheers,
> Tricia
> 
> 
> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:
> 
>> i'm using solr 4.3.1 and the dataimporter. i am trying to use
>> XPathEntityProcessor within the TikaEntityProcessor for indexing html-pages
>> but i'm getting this error for each document. i have also tried
>> dataField="tika.text" and dataField="text" to no avail. the nested
>> XPathEntityProcessor "detail" creates the error, the rest works fine. what
>> am i doing wrong?
>> 
>> error:
>> 
>> ERROR - 2013-09-26 12:08:49.006;
>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
>> 'null'
>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>> java.util.Iterator
>>        at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>        at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>        at
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>        at
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>        at
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>        at
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>        at
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>        at
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>        at
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>        at
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>        at
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>        at
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>        at
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>        at
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>        at
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>        at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>        at
>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>        at
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>        at
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>        at
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>        at
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>        at java.lang.Thread.run(Unknown Source)
>> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
>> Exception in entity :
>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
>> java.lang.ClassCastException: java.io.StringReader cannot be cast to
>> java.util.Iterator
>>        at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
>>        at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>>        at
>> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>>        at
>> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>>        at
>> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>>        at
>> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>>        at
>> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>>        at
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>>        at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>>        at
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>>        at
>> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>>        at
>> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>>        at
>> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>>        at
>> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>>        at
>> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>>        at
>> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>>        at
>> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>>        at
>> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>>        at
>> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>>        at
>> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>>        at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>>        at
>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>>        at
>> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>>        at
>> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>>        at
>> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>>        at
>> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>>        at java.lang.Thread.run(Unknown Source)
>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
>> cast to java.util.Iterator
>>        at
>> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>>        ... 41 more
>> 
>> 
>> 
>> data-config.xml
>> 
>> <dataConfig>
>>        <dataSource type="BinURLDataSource" name="dataFile"/>
>>        <dataSource type="BinURLDataSource" name="dataUrl"/>
>>        <dataSource type="URLDataSource" name="main"/>
>>        <dataSource type="FieldReaderDataSource" name="fld"/>
>> <document>
>> <entity name="rec" processor="XPathEntityProcessor"
>> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
>> forEach="/docs/doc" dataSource="main">
>>                <field column="title" xpath="//title" />
>>                <field column="id" xpath="//id" />
>>                <field column="file" xpath="//file" />
>>                <field column="url" xpath="//url" />
>>                <field column="urlParse" xpath="//urlParse" />
>>                <field column="last_modified" xpath="//last_modified" />
>>                <field column="Author" xpath="//author" />
>> 
>>                <entity name="tika" processor="TikaEntityProcessor"
>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>>                        <field column="text"/>
>> 
>>                        <entity name="detail" type="XPathEntityProcessor"
>> forEach="/html" dataSource="fld" dataField="${tika.text}" rootEntity="true"
>> onError="skip">
>>                                <field xpath="//h1" column="h_1" />
>>                        </entity>
>>                </entity>
>>        </entity>
>> </document>
>> </dataConfig>


Re: XPathEntityProcessor nested in TikaEntityProcessor query null exception

Posted by P Williams <wi...@gmail.com>.
Hi,

I haven't tried this myself, but maybe try leaving out the
FieldReaderDataSource entirely.  From my quick searching, it looks like it's
tied to SQL.  Did you try copying the
http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
exactly?  What happens when you leave out FieldReaderDataSource?

Cheers,
Tricia


On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <ao...@conx.ch> wrote:

> i'm using solr 4.3.1 and the dataimporter. i am trying to use
> XPathEntityProcessor within the TikaEntityProcessor for indexing html-pages
> but i'm getting this error for each document. i have also tried
> dataField="tika.text" and dataField="text" to no avail. the nested
> XPathEntityProcessor "detail" creates the error, the rest works fine. what
> am i doing wrong?
>
> error:
>
> ERROR - 2013-09-26 12:08:49.006;
> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
> 'null'
> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> java.util.Iterator
>         at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>         at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>         at
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>         at
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>         at
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>         at
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>         at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>         at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>         at
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>         at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>         at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>         at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>         at
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>         at
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>         at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>         at
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>         at
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>         at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>         at
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>         at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>         at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>         at org.eclipse.jetty.server.Server.handle(Server.java:365)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>         at
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>         at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>         at
> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>         at
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>         at
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>         at java.lang.Thread.run(Unknown Source)
> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
> Exception in entity :
> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> java.util.Iterator
>         at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
>         at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
>         at
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
>         at
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
>         at
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
>         at
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
>         at
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
>         at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>         at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
>         at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
>         at
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
>         at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
>         at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>         at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
>         at
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>         at
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
>         at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
>         at
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>         at
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
>         at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>         at
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>         at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>         at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>         at org.eclipse.jetty.server.Server.handle(Server.java:365)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
>         at
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
>         at
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
>         at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
>         at
> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>         at
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>         at
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>         at java.lang.Thread.run(Unknown Source)
> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
> cast to java.util.Iterator
>         at
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
>         ... 41 more
>
>
>
> data-config.xml
>
> <dataConfig>
>         <dataSource type="BinURLDataSource" name="dataFile"/>
>         <dataSource type="BinURLDataSource" name="dataUrl"/>
>         <dataSource type="URLDataSource" name="main"/>
>         <dataSource type="FieldReaderDataSource" name="fld"/>
> <document>
> <entity name="rec" processor="XPathEntityProcessor"
> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
> forEach="/docs/doc" dataSource="main">
>                 <field column="title" xpath="//title" />
>                 <field column="id" xpath="//id" />
>                 <field column="file" xpath="//file" />
>                 <field column="url" xpath="//url" />
>                 <field column="urlParse" xpath="//urlParse" />
>                 <field column="last_modified" xpath="//last_modified" />
>                 <field column="Author" xpath="//author" />
>
>                 <entity name="tika" processor="TikaEntityProcessor"
> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
>                         <field column="text"/>
>
>                         <entity name="detail" type="XPathEntityProcessor"
> forEach="/html" dataSource="fld" dataField="${tika.text}" rootEntity="true"
> onError="skip">
>                                 <field xpath="//h1" column="h_1" />
>                         </entity>
>                 </entity>
>         </entity>
> </document>
> </dataConfig>