You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by txlap786 <tx...@hotmail.com> on 2017/07/21 11:25:36 UTC
Tika-DIH (zip to xml)
# I am trying to extract zip files (which contain XML files) using DIH.
# I can already read data from plain XML files and index it like this:
<!--
  DIH configuration for indexing plain XML files from disk.
  The outer entity lists the files; the inner entity parses each listed file
  with XPath, iterating over /accountingEntries/entryHeader.
-->
<dataConfig>
  <!-- The only data source declared here. It is unnamed, so it acts as the default. -->
  <dataSource type="FileDataSource" encoding="UTF-8"/>
  <document>
    <!--
      Lists files under baseDir (sub-directories included, recursive="true").
      dataSource="null" because listing files does not read from a data source.
      rootEntity="false": rows of this entity are not documents themselves.
      NOTE(review): fileName is a regular expression and ".*xml" is not anchored
      to the extension; confirm whether ".*\.xml$" is what is intended.
    -->
    <entity name="pickupdir"
            processor="FileListEntityProcessor"
            dataSource="null"
            baseDir="${solr.install.dir}\example\exampledocs\myFiles"
            fileName=".*xml"
            recursive="true"
            rootEntity="false">
      <!--
        Parses the file reported by the outer entity (pickupdir.fileAbsolutePath).
        This entity has no dataSource attribute, so it uses the unnamed data
        source declared at the top. DIH attribute names are case-sensitive
        ("dataSource"), and the value has to name a <dataSource>, not an entity.
        onError="skip" means a failing file is skipped instead of aborting the import.
      -->
      <entity name="xml"
              processor="XPathEntityProcessor"
              url="${pickupdir.fileAbsolutePath}"
              forEach="/accountingEntries/entryHeader"
              stream="false"
              rootEntity="true"
              pk="lineNumber"
              onError="skip">
        <field column="entryNumberCounter"
               xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
        <field column="entryNumber"
               xpath="/accountingEntries/entryHeader/entryNumber"/>
        <field column="enteredDate"
               xpath="/accountingEntries/entryHeader/enteredDate"/>
        <field column="totalDebit"
               xpath="/accountingEntries/entryHeader/totalDebit"/>
        <field column="totalCredit"
               xpath="/accountingEntries/entryHeader/totalCredit"/>
        <!-- lineNumber sits one level deeper, under entryDetail. -->
        <field column="lineNumber"
               xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
      </entity>
    </entity>
  </document>
</dataConfig>
# How can I add TikaEntityProcessor? I tried it this way:
<!--
  Attempted DIH configuration for zip archives that contain XML files:
  list the .zip files, run each through Tika, then apply the XPath mapping.
  NOTE(review): as written this does not yet feed the archived XML to the
  XPath entity; see the review notes on the "ext" and "xml" entities.
-->
<dataConfig>
  <!-- The only data source declared here. It is unnamed, so it acts as the default. -->
  <dataSource type="BinFileDataSource" encoding="UTF-8"/>
  <document>
    <!--
      Lists .zip files directly under baseDir (recursive="false").
      dataSource="null" because listing files does not read from a data source.
    -->
    <entity name="pickupdir"
            processor="FileListEntityProcessor"
            dataSource="null"
            baseDir="${solr.install.dir}\example\exampledocs\myFiles"
            fileName=".*zip"
            recursive="false"
            rootEntity="false">
      <!--
        Hands each archive to Tika and asks for XML-formatted output.
        NOTE(review): the debug output posted with this config shows the "text"
        column holding Tika's XHTML description of the archive (a package-entry
        div naming the member file), not the content of the XML file inside it.
      -->
      <entity name="ext"
              processor="TikaEntityProcessor"
              url="${pickupdir.fileAbsolutePath}"
              format="xml"
              rootEntity="false">
        <!--
          NOTE(review): url still points at pickupdir.fileAbsolutePath, i.e. the
          .zip itself; nothing produced by "ext" is consumed here (the posted
          debug output shows the .zip path as this entity's query).
          NOTE(review): this entity has no dataSource attribute, so it uses the
          unnamed BinFileDataSource above. XPathEntityProcessor presumably needs
          a character-stream source (FileDataSource or FieldReaderDataSource);
          confirm against the DIH reference. With onError="skip" such a failure
          would be skipped silently.
          DIH attribute names are case-sensitive ("dataSource"), and the value
          has to name a <dataSource>, not an entity.
        -->
        <entity name="xml"
                processor="XPathEntityProcessor"
                url="${pickupdir.fileAbsolutePath}"
                forEach="/accountingEntries/entryHeader"
                stream="false"
                rootEntity="true"
                pk="lineNumber"
                onError="skip">
          <field column="entryNumberCounter"
                 xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
          <field column="entryNumber"
                 xpath="/accountingEntries/entryHeader/entryNumber"/>
          <field column="enteredDate"
                 xpath="/accountingEntries/entryHeader/enteredDate"/>
          <field column="totalDebit"
                 xpath="/accountingEntries/entryHeader/totalDebit"/>
          <field column="totalCredit"
                 xpath="/accountingEntries/entryHeader/totalCredit"/>
          <!-- lineNumber sits one level deeper, under entryDetail. -->
          <field column="lineNumber"
                 xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
        </entity>
      </entity>
    </entity>
  </document>
</dataConfig>
# And this is the result I got:
"----------- row #1-------------",
"file",
"YYY.zip",
"fileSize",
1124851,
"fileLastModified",
"2017-07-21T08:18:23.085Z",
"fileDir",
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles",
"fileAbsolutePath",
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
null,
"---------------------------------------------",
"entity:ext",
[
"query",
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
"time-taken",
"0:0:0.0",
null,
"----------- row #1-------------",
"text",
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><html
xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.DefaultParser\"/>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.pkg.PackageParser\"/>\r\n<meta
name=\"Content-Type\"
content=\"application/zip\"/>\r\n<title/>\r\n</head>\r\n<body><div
class=\"embedded\" id=\"9860029035-201601-Y-000000.xml\"/>\r\n<div
class=\"package-entry\">
9860029035-201601-Y-000000.xml
\r\n</div>\r\n</body></html>",
null,
"---------------------------------------------",
"entity:xml",
[
"document#1",
[
"query",
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
"time-taken",
"0:0:0.0"
# Please explain how this works.
--
View this message in context: http://lucene.472066.n3.nabble.com/Tika-DIH-zip-to-xml-tp4347122.html
Sent from the Solr - User mailing list archive at Nabble.com.