You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by txlap786 <tx...@hotmail.com> on 2017/07/21 11:25:36 UTC

Tika-DIH (zip to xml)

# I am trying to extract zip files (which contain XML files) using DIH.

# I can read the data from plain XML files and index it like this:

<dataConfig>
    <!--
      Indexes plain XML files: the outer entity lists every file under baseDir
      whose name matches fileName, and the inner entity emits one row per
      /accountingEntries/entryHeader element found in each listed file.
    -->

    <!-- Unnamed data source; entities that do not name a dataSource use this one. -->
    <dataSource encoding="UTF-8" type="FileDataSource"/>

    <document>
        <!-- File lister only: it supplies ${pickupdir.fileAbsolutePath} to the child entity. -->
        <entity name="pickupdir"
                processor="FileListEntityProcessor"
                rootEntity="false"
                dataSource="null"
                fileName=".*xml"
                baseDir="${solr.install.dir}\example\exampledocs\myFiles"
                recursive="true">

            <!--
              Parses each listed file with XPath.
              The former lowercase datasource="pickupdir" attribute was removed:
              "pickupdir" is the parent entity, not a declared data source, and the
              spelling used elsewhere in this config is dataSource, so the attribute
              had no effect on this (working) import.
            -->
            <entity name="xml"
                    processor="XPathEntityProcessor"
                    stream="false"
                    forEach="/accountingEntries/entryHeader"
                    url="${pickupdir.fileAbsolutePath}"
                    rootEntity="true"
                    pk="lineNumber"
                    onError="skip">
                <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
                <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/>
                <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/>
                <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/>
                <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/>
                <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
            </entity>
        </entity>
    </document>
</dataConfig>

# How can I add TikaEntityProcessor? I tried it this way:

<dataConfig>
    <!--
      Attempt to index XML files packed inside zip archives: list the *.zip files,
      run each archive through Tika, then apply the accountingEntries XPath mapping.
    -->

    <!-- Unnamed binary data source; entities that do not name a dataSource use this one. -->
    <dataSource encoding="UTF-8" type="BinFileDataSource"/>

    <document>
        <!-- File lister only: it supplies ${pickupdir.fileAbsolutePath} to the child entities. -->
        <entity name="pickupdir"
                processor="FileListEntityProcessor"
                rootEntity="false"
                dataSource="null"
                fileName=".*zip"
                baseDir="${solr.install.dir}\example\exampledocs\myFiles"
                recursive="false">

            <!--
              Tika pass over the archive. In the debug output posted with this config,
              the "text" column produced here is an XHTML listing of the package entry
              name, not the content of the XML file stored inside the zip.
            -->
            <entity name="ext"
                    processor="TikaEntityProcessor"
                    url="${pickupdir.fileAbsolutePath}"
                    format="xml"
                    rootEntity="false">

                <!--
                  The former lowercase datasource="pickupdir" attribute was removed:
                  "pickupdir" is an entity, not a declared data source, and the spelling
                  used elsewhere in this config is dataSource.

                  NOTE(review): url still resolves to the .zip path (the posted debug
                  output shows the archive path as the query of entity:xml), so this
                  entity is pointed at the archive itself and nothing here consumes the
                  output of "ext". Presumably the archives must be unpacked before DIH
                  can apply these XPaths, or a character-stream data source is needed for
                  this entity; verify against the DIH documentation. TODO confirm.
                -->
                <entity name="xml"
                        processor="XPathEntityProcessor"
                        stream="false"
                        forEach="/accountingEntries/entryHeader"
                        url="${pickupdir.fileAbsolutePath}"
                        rootEntity="true"
                        pk="lineNumber"
                        onError="skip">
                    <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
                    <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/>
                    <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/>
                    <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/>
                    <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/>
                    <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
                </entity>
            </entity>
        </entity>
    </document>
</dataConfig>

# And this is the result I got:

"----------- row #1-------------",
      "file",
      "YYY.zip",
      "fileSize",
      1124851,
      "fileLastModified",
      "2017-07-21T08:18:23.085Z",
      "fileDir",
      "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles",
      "fileAbsolutePath",
     
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
      null,
      "---------------------------------------------",
      "entity:ext",
      [
        "query",
       
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
        "time-taken",
        "0:0:0.0",
        null,
        "----------- row #1-------------",
        "text",
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html
xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.DefaultParser\"/>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.pkg.PackageParser\"/>\r\n<meta
name=\"Content-Type\"
content=\"application/zip\"/>\r\n<title/>\r\n</head>\r\n<body><div
class=\"embedded\" id=\"9860029035-201601-Y-000000.xml\"/>\r\n<div
class=\"package-entry\">
9860029035-201601-Y-000000.xml
\r\n</div>\r\n</body></html>",
        null,
        "---------------------------------------------",
        "entity:xml",
        [
          "document#1",
          [
            "query",
           
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
            "time-taken",
            "0:0:0.0"

# Could someone please explain how this works?



--
View this message in context: http://lucene.472066.n3.nabble.com/Tika-DIH-zip-to-xml-tp4347122.html
Sent from the Solr - User mailing list archive at Nabble.com.