You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "István Bakró Nagy (JIRA)" <ji...@apache.org> on 2015/06/12 12:37:01 UTC

[jira] [Updated] (SOLR-7670) solr import files from multiple dataSource entity

     [ https://issues.apache.org/jira/browse/SOLR-7670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

István Bakró Nagy updated SOLR-7670:
------------------------------------
    Description: 
I am trying to import files from multiple folders.

My solrconfig.xml invokes the following file to use it with org.apache.solr.handler.dataimport.DataImportHandler.


{{<dataConfig>  
    <dataSource type="BinFileDataSource" />
        <document>
            <entity name="files1"
                    dataSource="null"
                    rootEntity="false"
                    processor="FileListEntityProcessor"
                    baseDir="/w/PDF/"
                    fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
                    onError="skip"
                    recursive="true">

                <field column="fileAbsolutePath" name="id" />
                <field column="fileSize" name="size" />
                <field column="fileLastModified" name="lastModified" />
                <field column="file" name="fileName"/>

                <entity
                    name="documentImport1"
                    processor="TikaEntityProcessor"
                    url="${files.fileAbsolutePath}"
                    format="text">
                    <field column="file" name="fileName"/>
                    <field column="Author" name="author" meta="true"/>
                    <field column="title" name="title" meta="true"/>
                    <field column="text" name="text"/>
                    <copyField source="content" dest="text"/>

                </entity>
            </entity>

            <entity name="files2"
                    dataSource="null"
                    rootEntity="false"
                    processor="FileListEntityProcessor"
                    baseDir="/w/KNOW-HOW/"
                    fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
                    onError="skip"
                    recursive="true">

                <field column="fileAbsolutePath" name="id" />
                <field column="fileSize" name="size" />
                <field column="fileLastModified" name="lastModified" />
                <field column="file" name="fileName"/>

                <entity
                    name="documentImport2"
                    processor="TikaEntityProcessor"
                    url="${files.fileAbsolutePath}"
                    format="text">
                    <field column="file" name="fileName"/>
                    <field column="Author" name="author" meta="true"/>
                    <field column="title" name="title" meta="true"/>
                    <field column="text" name="text"/>
                    <copyField source="content" dest="text"/>

                </entity>
            </entity>
        </document> 
</dataConfig>}}

During import I get a FileNotFoundException.

What am I missing?

  was:
I am trying to import files from multiple folders.

My solrconfig.xml invokes the following file to use it with org.apache.solr.handler.dataimport.DataImportHandler.

<dataConfig>  
    <dataSource type="BinFileDataSource" />
        <document>
            <entity name="files1"
                    dataSource="null"
                    rootEntity="false"
                    processor="FileListEntityProcessor"
                    baseDir="/w/PDF/"
                    fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
                    onError="skip"
                    recursive="true">

                <field column="fileAbsolutePath" name="id" />
                <field column="fileSize" name="size" />
                <field column="fileLastModified" name="lastModified" />
                <field column="file" name="fileName"/>

                <entity
                    name="documentImport1"
                    processor="TikaEntityProcessor"
                    url="${files.fileAbsolutePath}"
                    format="text">
                    <field column="file" name="fileName"/>
                    <field column="Author" name="author" meta="true"/>
                    <field column="title" name="title" meta="true"/>
                    <field column="text" name="text"/>
                    <copyField source="content" dest="text"/>

                </entity>
            </entity>

            <entity name="files2"
                    dataSource="null"
                    rootEntity="false"
                    processor="FileListEntityProcessor"
                    baseDir="/w/KNOW-HOW/"
                    fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
                    onError="skip"
                    recursive="true">

                <field column="fileAbsolutePath" name="id" />
                <field column="fileSize" name="size" />
                <field column="fileLastModified" name="lastModified" />
                <field column="file" name="fileName"/>

                <entity
                    name="documentImport2"
                    processor="TikaEntityProcessor"
                    url="${files.fileAbsolutePath}"
                    format="text">
                    <field column="file" name="fileName"/>
                    <field column="Author" name="author" meta="true"/>
                    <field column="title" name="title" meta="true"/>
                    <field column="text" name="text"/>
                    <copyField source="content" dest="text"/>

                </entity>
            </entity>
        </document> 
</dataConfig>  
During import I get a FileNotFoundException.

What am I missing?


> solr import files from multiple dataSource entity
> -------------------------------------------------
>
>                 Key: SOLR-7670
>                 URL: https://issues.apache.org/jira/browse/SOLR-7670
>             Project: Solr
>          Issue Type: Bug
>    Affects Versions: 5.1
>            Reporter: István Bakró Nagy
>            Priority: Minor
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> I am trying to import files from multiple folders.
> My solrconfig.xml invokes the following file to use it with org.apache.solr.handler.dataimport.DataImportHandler.
> {{<dataConfig>  
>     <dataSource type="BinFileDataSource" />
>         <document>
>             <entity name="files1"
>                     dataSource="null"
>                     rootEntity="false"
>                     processor="FileListEntityProcessor"
>                     baseDir="/w/PDF/"
>                     fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
>                     onError="skip"
>                     recursive="true">
>                 <field column="fileAbsolutePath" name="id" />
>                 <field column="fileSize" name="size" />
>                 <field column="fileLastModified" name="lastModified" />
>                 <field column="file" name="fileName"/>
>                 <entity
>                     name="documentImport1"
>                     processor="TikaEntityProcessor"
>                     url="${files.fileAbsolutePath}"
>                     format="text">
>                     <field column="file" name="fileName"/>
>                     <field column="Author" name="author" meta="true"/>
>                     <field column="title" name="title" meta="true"/>
>                     <field column="text" name="text"/>
>                     <copyField source="content" dest="text"/>
>                 </entity>
>             </entity>
>             <entity name="files2"
>                     dataSource="null"
>                     rootEntity="false"
>                     processor="FileListEntityProcessor"
>                     baseDir="/w/KNOW-HOW/"
>                     fileName=".*\.(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(odf)|(txt)|(rtf)|(html)|(htm)|(jpg)"
>                     onError="skip"
>                     recursive="true">
>                 <field column="fileAbsolutePath" name="id" />
>                 <field column="fileSize" name="size" />
>                 <field column="fileLastModified" name="lastModified" />
>                 <field column="file" name="fileName"/>
>                 <entity
>                     name="documentImport2"
>                     processor="TikaEntityProcessor"
>                     url="${files.fileAbsolutePath}"
>                     format="text">
>                     <field column="file" name="fileName"/>
>                     <field column="Author" name="author" meta="true"/>
>                     <field column="title" name="title" meta="true"/>
>                     <field column="text" name="text"/>
>                     <copyField source="content" dest="text"/>
>                 </entity>
>             </entity>
>         </document> 
> </dataConfig>}}
> During import I get a FileNotFoundException.
> What am I missing?



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org