You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Alexandre Rafalovitch (JIRA)" <ji...@apache.org> on 2016/10/05 01:39:20 UTC

[jira] [Closed] (SOLR-4809) OpenOffice document body is not indexed by SolrCell

     [ https://issues.apache.org/jira/browse/SOLR-4809?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Alexandre Rafalovitch closed SOLR-4809.
---------------------------------------
    Resolution: Implemented

This was a Tika issue, not a Solr one, and it was already fixed in Tika 1.5.

> OpenOffice document body is not indexed by SolrCell
> ---------------------------------------------------
>
>                 Key: SOLR-4809
>                 URL: https://issues.apache.org/jira/browse/SOLR-4809
>             Project: Solr
>          Issue Type: Bug
>          Components: contrib - Solr Cell (Tika extraction)
>    Affects Versions: 3.6.1, 4.3
>            Reporter: Jack Krupansky
>         Attachments: HelloWorld.docx, HelloWorld.odp, HelloWorld.odt, HelloWorld.txt, SOLR-4809.patch
>
>
> As reported on the solr user mailing list, SolrCell is not indexing document body content for OpenOffice documents.
> I tested with Apache Open Office 3.4.1 on Solr 4.3 and 3.6.1, for both OpenWriter (.ODT) and Impress (.ODP).
> The extractOnly option does return the document body text, but Solr does not index the document body text. In my test cases (.ODP and .ODT), all I see for the "content" attribute in Solr is a few spaces.
> Using the example schema, I indexed HelloWorld.odt using:
> {code}
>  curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&uprefix=attr_&commit=true" -F "myfile=@HelloWorld.odt"
> {code}
> It queries as:
> {code}
> <?xml version="1.0" encoding="UTF-8"?>
> <response>
> <lst name="responseHeader">
>   <int name="status">0</int>
>   <int name="QTime">2</int>
>   <lst name="params">
>     <str name="indent">true</str>
>     <str name="q">id:doc-1</str>
>   </lst>
> </lst>
> <result name="response" numFound="1" start="0">
>   <doc>
>     <str name="id">doc-1</str>
>     <arr name="attr_image_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_editing_cycles">
>       <str>1</str>
>     </arr>
>     <arr name="attr_stream_source_info">
>       <str>myfile</str>
>     </arr>
>     <arr name="attr_meta_save_date">
>       <str>2013-05-10T17:15:40.99</str>
>     </arr>
>     <arr name="attr_dc_subject">
>       <str>Hello, World</str>
>     </arr>
>     <str name="subject">Hello World - subject</str>
>     <arr name="attr_dcterms_created">
>       <str>2013-05-10T17:11:58.88</str>
>     </arr>
>     <arr name="attr_date">
>       <str>2013-05-10T17:15:40.99</str>
>     </arr>
>     <arr name="attr_dc_description">
>       <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
>     </arr>
>     <arr name="attr_nbobject">
>       <str>0</str>
>     </arr>
>     <arr name="attr_word_count">
>       <str>10</str>
>     </arr>
>     <arr name="attr_edit_time">
>       <str>PT3M44S</str>
>     </arr>
>     <arr name="attr_meta_paragraph_count">
>       <str>4</str>
>     </arr>
>     <arr name="attr_creation_date">
>       <str>2013-05-10T17:11:58.88</str>
>     </arr>
>     <arr name="title">
>       <str>Hello World SolrCell Test - title</str>
>     </arr>
>     <arr name="attr_object_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_stream_content_type">
>       <str>application/octet-stream</str>
>     </arr>
>     <arr name="attr_nbimg">
>       <str>0</str>
>     </arr>
>     <str name="description">This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
>     <arr name="attr_stream_size">
>       <str>8960</str>
>     </arr>
>     <arr name="attr_meta_object_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_cp_subject">
>       <str>Hello World - subject</str>
>     </arr>
>     <arr name="attr_stream_name">
>       <str>HelloWorld.odt</str>
>     </arr>
>     <arr name="attr_generator">
>       <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
>     </arr>
>     <str name="keywords">Hello, World</str>
>     <arr name="attr_last_save_date">
>       <str>2013-05-10T17:15:40.99</str>
>     </arr>
>     <arr name="attr_paragraph_count">
>       <str>4</str>
>     </arr>
>     <arr name="attr_dc_title">
>       <str>Hello World SolrCell Test - title</str>
>     </arr>
>     <arr name="attr_dcterms_modified">
>       <str>2013-05-10T17:15:40.99</str>
>     </arr>
>     <arr name="attr_meta_creation_date">
>       <str>2013-05-10T17:11:58.88</str>
>     </arr>
>     <arr name="attr_page_count">
>       <str>1</str>
>     </arr>
>     <arr name="attr_meta_character_count">
>       <str>60</str>
>     </arr>
>     <date name="last_modified">2013-05-10T17:15:40Z</date>
>     <arr name="attr_nbtab">
>       <str>0</str>
>     </arr>
>     <arr name="attr_meta_word_count">
>       <str>10</str>
>     </arr>
>     <arr name="attr_meta_table_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_modified">
>       <str>2013-05-10T17:15:40.99</str>
>     </arr>
>     <arr name="attr_meta_image_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_xmptpg_npages">
>       <str>1</str>
>     </arr>
>     <arr name="attr_table_count">
>       <str>0</str>
>     </arr>
>     <arr name="attr_nbpara">
>       <str>4</str>
>     </arr>
>     <arr name="attr_character_count">
>       <str>60</str>
>     </arr>
>     <arr name="attr_meta_page_count">
>       <str>1</str>
>     </arr>
>     <arr name="attr_nbword">
>       <str>10</str>
>     </arr>
>     <arr name="attr_nbpage">
>       <str>1</str>
>     </arr>
>     <arr name="content_type">
>       <str>application/vnd.oasis.opendocument.text</str>
>     </arr>
>     <arr name="attr_nbcharacter">
>       <str>60</str>
>     </arr>
>     <arr name="content">
>       <str>  </str>
>     </arr>
>     <long name="_version_">1434688567598120960</long></doc>
> </result>
> </response>
> {code}
> Command to extract as text:
> {code}
> curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&indent=true&extractOnly=true&extractFormat=text&commit=true" -F "myfile=@HelloWorld.odt"
> {code}
> The response:
> {code}
> <?xml version="1.0" encoding="UTF-8"?>
> <response>
> <lst name="responseHeader">
>   <int name="status">0</int>
>   <int name="QTime">124</int>
> </lst>
> <str name="HelloWorld.odt">
> Hello World, from OpenOffice!
> Third line.
> Fourth line.
> The end.
> </str>
> <lst name="HelloWorld.odt_metadata">
>   <arr name="Image-Count">
>     <str>0</str>
>   </arr>
>   <arr name="editing-cycles">
>     <str>1</str>
>   </arr>
>   <arr name="stream_source_info">
>     <str>myfile</str>
>   </arr>
>   <arr name="meta:save-date">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="dc:subject">
>     <str>Hello, World</str>
>   </arr>
>   <arr name="subject">
>     <str>Hello World - subject</str>
>   </arr>
>   <arr name="dcterms:created">
>     <str>2013-05-10T17:11:58.88</str>
>   </arr>
>   <arr name="date">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="dc:description">
>     <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
>   </arr>
>   <arr name="nbObject">
>     <str>0</str>
>   </arr>
>   <arr name="Word-Count">
>     <str>10</str>
>   </arr>
>   <arr name="Edit-Time">
>     <str>PT3M44S</str>
>   </arr>
>   <arr name="meta:paragraph-count">
>     <str>4</str>
>   </arr>
>   <arr name="Creation-Date">
>     <str>2013-05-10T17:11:58.88</str>
>   </arr>
>   <arr name="title">
>     <str>Hello World SolrCell Test - title</str>
>   </arr>
>   <arr name="Object-Count">
>     <str>0</str>
>   </arr>
>   <arr name="stream_content_type">
>     <str>application/octet-stream</str>
>   </arr>
>   <arr name="nbImg">
>     <str>0</str>
>   </arr>
>   <arr name="description">
>     <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
>   </arr>
>   <arr name="stream_size">
>     <str>8960</str>
>   </arr>
>   <arr name="meta:object-count">
>     <str>0</str>
>   </arr>
>   <arr name="cp:subject">
>     <str>Hello World - subject</str>
>   </arr>
>   <arr name="stream_name">
>     <str>HelloWorld.odt</str>
>   </arr>
>   <arr name="generator">
>     <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
>   </arr>
>   <arr name="Keywords">
>     <str>Hello, World</str>
>   </arr>
>   <arr name="Last-Save-Date">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="Paragraph-Count">
>     <str>4</str>
>   </arr>
>   <arr name="dc:title">
>     <str>Hello World SolrCell Test - title</str>
>   </arr>
>   <arr name="dcterms:modified">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="meta:creation-date">
>     <str>2013-05-10T17:11:58.88</str>
>   </arr>
>   <arr name="Page-Count">
>     <str>1</str>
>   </arr>
>   <arr name="meta:character-count">
>     <str>60</str>
>   </arr>
>   <arr name="Last-Modified">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="nbTab">
>     <str>0</str>
>   </arr>
>   <arr name="meta:word-count">
>     <str>10</str>
>   </arr>
>   <arr name="meta:table-count">
>     <str>0</str>
>   </arr>
>   <arr name="modified">
>     <str>2013-05-10T17:15:40.99</str>
>   </arr>
>   <arr name="meta:image-count">
>     <str>0</str>
>   </arr>
>   <arr name="xmpTPg:NPages">
>     <str>1</str>
>   </arr>
>   <arr name="Table-Count">
>     <str>0</str>
>   </arr>
>   <arr name="nbPara">
>     <str>4</str>
>   </arr>
>   <arr name="Character Count">
>     <str>60</str>
>   </arr>
>   <arr name="meta:page-count">
>     <str>1</str>
>   </arr>
>   <arr name="nbWord">
>     <str>10</str>
>   </arr>
>   <arr name="nbPage">
>     <str>1</str>
>   </arr>
>   <arr name="Content-Type">
>     <str>application/vnd.oasis.opendocument.text</str>
>   </arr>
>   <arr name="nbCharacter">
>     <str>60</str>
>   </arr>
> </lst>
> </response>
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org