You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/09/26 16:49:05 UTC

svn commit: r1390531 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Wed Sep 26 14:49:05 2012
New Revision: 1390531

URL: http://svn.apache.org/viewvc?rev=1390531&view=rev
Log:
TIKA-999: extract page, word, character count metadata from RTF docs

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1390531&r1=1390530&r2=1390531&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Sep 26 14:49:05 2012
@@ -25,6 +25,9 @@ Release 1.3 - Current Development
     key, and TikaCLI prepends the rId (if present) onto the filename
     it extracts (TIKA-989).
 
+  * RTF: Page, word and character count metadata are now extracted for
+    RTF documents (TIKA-999).
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1390531&r1=1390530&r2=1390531&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Wed Sep 26 14:49:05 2012
@@ -30,9 +30,9 @@ import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.Map;
 
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.Property;
@@ -782,6 +782,12 @@ final class TextExtractor {
             } else if (equals("deff")) {
                 // Default font
                 globalDefaultFont = param;
+            } else if (equals("nofpages")) {
+                metadata.add(Office.PAGE_COUNT, Integer.toString(param));
+            } else if (equals("nofwords")) {
+                metadata.add(Office.WORD_COUNT, Integer.toString(param));
+            } else if (equals("nofchars")) {
+                metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
             }
 
             if (fontTableState == 1) {
@@ -905,8 +911,7 @@ final class TextExtractor {
 
             if (uprState == -1) {
                 // TODO: we can also parse \creatim, \revtim,
-                // \printim, \version, \nofpages, \nofwords,
-                // \nofchars, etc.
+                // \printim, \version, etc.
                 if (equals("author")) {
                     nextMetaData = TikaCoreProperties.CREATOR;
                 } else if (equals("title")) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1390531&r1=1390530&r2=1390531&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Wed Sep 26 14:49:05 2012
@@ -20,7 +20,6 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.StringWriter;
-
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
@@ -30,6 +29,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
@@ -320,6 +320,14 @@ public class RTFParserTest extends TikaT
         assertTrue(getXML("testBinControlWord.rtf").xml.indexOf("\u00ff\u00ff\u00ff\u00ff") == -1);
     }
 
+    // TIKA-999
+    public void testMetaDataCounts() throws Exception {
+      XMLResult xml = getXML("test_embedded_package.rtf");
+      assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
+      assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
+      assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
+    }
+
     private Result getResult(String filename) throws Exception {
         File file = getResourceAsFile("/test-documents/" + filename);
        

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf?rev=1390531&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf Wed Sep 26 14:49:05 2012
@@ -0,0 +1,71 @@
+{\rtf1\adeflang1037\ansi\ansicpg1255\uc1\adeff0\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f42\froman\fcharset238\fprq2 Times New Roman CE;}{\f43\froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\f45\froman\fcharset161\fprq2 Times New Roman Greek;}{\f46\froman\fcharset162\fprq2 Times New Roman Tur;}{\f47\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f48\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\f49\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f50\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;
+\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
+{\stylesheet{\rtlpar \qr \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1037 \ltrch\fcs0 \fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext0 Normal;}{\*\cs10 \additive 
+\ssemihidden Default Paragraph Font;}{\*
+\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv 
+\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20 \ltrch\fcs0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}
+{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\rsidtbl \rsid80757\rsid88682\rsid95691\rsid338110\rsid397126\rsid464105\rsid610070\rsid725275\rsid726039\rsid735318\rsid878906\rsid944797\rsid990930\rsid1050282\rsid1055534\rsid1120643\rsid1189299
+\rsid1261895\rsid1326734\rsid1443796\rsid1446790\rsid1517719\rsid1523823\rsid1528736\rsid1663143\rsid1727465\rsid2036260\rsid2038523\rsid2059876\rsid2063302\rsid2187654\rsid2312151\rsid2426540\rsid2437546\rsid2565040\rsid2640885\rsid2648812\rsid2772643
+\rsid2773503\rsid2912151\rsid2969817\rsid3040309\rsid3158843\rsid3233420\rsid3236045\rsid3428477\rsid3670425\rsid3680487\rsid3742951\rsid3802514\rsid3896077\rsid4201996\rsid4418743\rsid4529350\rsid4532076\rsid4543580\rsid4550860\rsid4603271\rsid4619240
+\rsid4922385\rsid4994523\rsid4998948\rsid5076894\rsid5178319\rsid5266379\rsid5271913\rsid5319055\rsid5450515\rsid5465573\rsid5518356\rsid5600380\rsid5644040\rsid5733174\rsid5862467\rsid6039985\rsid6160579\rsid6161425\rsid6227723\rsid6436153\rsid6455244
+\rsid6559006\rsid6751148\rsid6774544\rsid6781683\rsid6841230\rsid6969526\rsid7017482\rsid7172550\rsid7354085\rsid7498469\rsid7685209\rsid7752870\rsid7815296\rsid7996767\rsid8137208\rsid8143210\rsid8158045\rsid8216425\rsid8278853\rsid8352996\rsid8400897
+\rsid8524781\rsid8616561\rsid8616769\rsid8723851\rsid8989021\rsid9000058\rsid9134755\rsid9335952\rsid9502787\rsid9860668\rsid9920906\rsid10048992\rsid10059037\rsid10110147\rsid10111401\rsid10113190\rsid10223797\rsid10224095\rsid10318407\rsid10358444
+\rsid10383241\rsid10688002\rsid10699260\rsid10704166\rsid10814210\rsid10964041\rsid11013754\rsid11077434\rsid11092724\rsid11287967\rsid11406514\rsid11481842\rsid11491890\rsid11607823\rsid11610900\rsid11619723\rsid11762481\rsid11797934\rsid11817636
+\rsid11867236\rsid11930369\rsid12210217\rsid12285473\rsid12453808\rsid12477582\rsid12532195\rsid12651456\rsid12721355\rsid12739128\rsid12875027\rsid12980573\rsid13122264\rsid13180963\rsid13186720\rsid13319286\rsid13333079\rsid13379867\rsid13394080
+\rsid13446699\rsid13517320\rsid13790272\rsid14032234\rsid14043176\rsid14095935\rsid14173935\rsid14182736\rsid14183554\rsid14245239\rsid14567860\rsid14572862\rsid14627854\rsid14877129\rsid14882513\rsid14906790\rsid15012029\rsid15012693\rsid15162518
+\rsid15410632\rsid15600659\rsid15666998\rsid15671610\rsid15734100\rsid15798587\rsid16147071\rsid16198390\rsid16199144\rsid16396019\rsid16527675}{\*\generator Microsoft Word 11.0.0000;}{\info{\author IBMer}{\operator IBMer}
+{\creatim\yr2012\mo9\dy2\hr13\min1}{\revtim\yr2012\mo9\dy3\hr10\min27}{\version4}{\edmins3}{\nofpages1}{\nofwords7}{\nofchars36}{\*\company IBM}{\nofcharsws42}{\vern24615}{\*\password 00000000}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/office/wor
+d/2003/wordml}}\paperw11906\paperh16838\margl1800\margr1800\margt1440\margb1440\gutter0\rtlsect\rtlgutter 
+\widowctrl\ftnbj\aenddoc\donotembedsysfont0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180
+\dgvspace180\dghorigin1800\dgvorigin1440\dghshow1\dgvshow1
+\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct
+\asianbrkrule\rsidroot7354085\newtblstyruls\nogrowautofit \fet0{\*\wgrffmtfilter 013f}\ilfomacatclnup0\rtlpar \sectd \rtlsect\rtlgutter\linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sectrsid10358444\sftnbj {\*\pnseclvl1
+\pnucrm\pnqc\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnqc\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3\pndec\pnqc\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnqc\pnstart1\pnindent720\pnhang {\pntxta )}}
+{\*\pnseclvl5\pndec\pnqc\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnqc\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnqc\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8
+\pnlcltr\pnqc\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnqc\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\pard\plain \rtlpar
+\qr \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5733174 \rtlch\fcs1 \af0\afs24\alang1037 \ltrch\fcs0 \fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\ltrch\fcs1 \rtlch\fcs0 
+\f0\fs24\lang1037\insrsid16147071 
+\par }{\pard\plain \rtlpar\qr \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5733174 \rtlch\fcs1 \af0\afs24\alang1037 \ltrch\fcs0 \fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033\insrsid5733174 {\object\objemb
+\objw628\objh765{\*\objclass Package}{\*\objdata 0105000002000000080000005061636b61676500000000000000000066000000
+020048772e74787400433a5c444f43554d457e315c6967616c73685c4465736b746f705c48572e747874000000030022000000433a5c444f43554d457e315c6967616c73685c4465736b746f705c48572e747874000b00000048656c6c6f20576f726c64000001050000050000000d0000004d45544146494c455049435400
+54040000bbfaffffee0000000800540445050000
+0100090000037300000002001c0000000000050000000b0200000000050000000c02320029001c000000fb02f5ff000000000000900100000001000000005461686f6d61000055170a7000fc070058b1f37761b1f3772040f57749366683040000002d01000005000000090200000000050000000102ffffff0005000000
+020101000000050000002e0106000000090000002105060048772e747874210015001c000000fb021000070000000000bc02000000000102022253797374656d00004936668300000a0026008a0100000000ffffffff8cfc0700040000002d010100030000000000}{\result {\rtlch\fcs1 \af0 \ltrch\fcs0 
+\insrsid1663143 {\*\shppict{\pict{\*\picprop\shplid1025{\sp{\sn shapeType}{\sv 75}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}{\sp{\sn fLockAspectRatio}{\sv 1}}{\sp{\sn pictureGray}{\sv 0}}
+{\sp{\sn pictureBiLevel}{\sv 0}}{\sp{\sn fRecolorFillAsPicture}{\sv 0}}{\sp{\sn fUseShapeAnchor}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fHitTestFill}{\sv 1}}{\sp{\sn fillShape}{\sv 1}}{\sp{\sn fillUseRect}{\sv 0}}{\sp{\sn fNoFillHitTest}{\sv 0}}
+{\sp{\sn fLine}{\sv 0}}{\sp{\sn fPreferRelativeResize}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}{\sp{\sn fReallyHidden}{\sv 0}}{\sp{\sn fScriptAnchor}{\sv 0}}{\sp{\sn fFakeMaster}{\sv 0}}{\sp{\sn fCameFromImgDummy}{\sv 0}}{\sp{\sn fLayoutInCell}{\sv 1}}}
+\picscalex100\picscaley100\piccropl0\piccropr0\piccropt0\piccropb0\picw1108\pich1349\picwgoal628\pichgoal765\emfblip\bliptag-1509580894{\*\blipuid a6059fa2509e8d66cab566a1d6bef7b5}
+010000006c00000021000000d3000000e9000000260100000000000000000000530400004605000020454d4600000100c8050000180000000300000000000000
+0000000000000000c012000000190000cb0000000f010000000000000000000000000000c019030055220400460000000c010000fe0000004744494301000080
+00030000964e894000000000e60000000100090000037300000002001c0000000000050000000b0200000000050000000c02320029001c000000fb02f5ff0000
+00000000900100000001000000005461686f6d61000055170a7000fc070058b1f37761b1f3772040f57749366683040000002d01000005000000090200000000
+050000000102ffffff0005000000020101000000050000002e0106000000090000002105060048772e747874210015001c000000fb021000070000000000bc02
+000000000102022253797374656d00004936668300000a0026008a0100000000ffffffff8cfc0700040000002d0101000300000000000000110000000c000000
+080000000b00000010000000060100003f0100000900000010000000060100003f0100000a000000100000000000000000000000090000001000000029000000
+32000000520000007001000001000000f5ffffff0000000000000000000000009001000000000001000000005400610068006f006d0061000000000000000000
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000917c90da130020e9907c2802917cffffffff
+2202917c9b01917cdb01917c0000000080db1300cb3e210c0000000015000000045538033e1306300000000000000000a0351700000000000000000014000000
+045538030000000000000000000000000000000000000000000000000000000000003f0200000000000000000000000000000000000000000000000078011700
+000000000000000001000000000000000817bc00a0351700041eaf040000000000000000b8435002000000000000000010f75a02e0714002e071400254d91300
+5e5af37754d913007024f27734284f0209000000e0714002ccbc5b02c401000068d91300ce33f277100000006476000800000000250000000c00000001000000
+180000000c00000000000000190000000c000000ffffff00120000000c00000001000000160000000c00000006000000540000007000000021000000d3000000
+e900000026010000010000000934d8415c1fd8411500000021000000060000004c000000000000000000000000000000ffffffffffffffff5800000048007700
+2e0074007800740007000000080000000400000003000000060000000300000052000000700100000200000010000000070000000000000000000000bc020000
+0000000001020222530079007300740065006d000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+000000000000000000000000cb3e210c480077002e007400780074000000000000000000000000000000000000000000000000000000000000003f0200000000
+000000000000000000000000000000000000000078011700000000000000000001000000000000000817bc00a0351700041eaf040000000000000000b8435002
+000000000000000000005a02e0714002e071400254d913005e5af37754d913007024f27734284f0209000000e0714002ccbc5b02c401000068d91300ce33f277
+10000000647600080000000010f75a02e071400250d913005e5af37750d913007024f277b4294f0225000000e0714002e071400270d913005e5af37770d91300
+7024f277e4294f026476000800000000250000000c00000002000000250000000c00000007000080250000000c00000000000080300000000c0000000f000080
+250000000c0000000e0000804b000000100000000000000005000000280000000c00000001000000280000000c000000020000000e00000014000000000000001000000014000000}}{\nonshppict{\pict\picscalex100\picscaley100\piccropl0\piccropr0\piccropt0\piccropb0
+\picw1108\pich1349\picwgoal628\pichgoal765\wmetafile8\bliptag-1509580894\blipupi600{\*\blipuid a6059fa2509e8d66cab566a1d6bef7b5}
+0100090000037300000002001c0000000000050000000b0200000000050000000c02320029001c000000fb02f5ff000000000000900100000001000000005461
+686f6d61000055170a7000fc070058b1f37761b1f3772040f57749366683040000002d01000005000000090200000000050000000102ffffff00050000000201
+01000000050000002e0106000000090000002105060048772e747874210015001c000000fb021000070000000000bc02000000000102022253797374656d00004936668300000a0026008a0100000000ffffffff8cfc0700040000002d010100030000000000}}}}}}\sectd \rtlsect\rtlgutter
+\linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sectrsid10358444\sftnbj {\ltrch\fcs1 \rtlch\fcs0 \f0\fs24\lang1037\insrsid10358444 
+\par }{\ltrch\fcs1 \rtlch\fcs0 \f0\fs24\lang1037\insrsid16147071 
+\par }\pard \rtlpar\qr \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid16147071 {\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid16147071 This file is embedded
+\par }\pard \rtlpar\qr \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5733174 {\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid16147071\charrsid5733174 
+\par }}
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_package.rtf
------------------------------------------------------------------------------
    svn:executable = *