You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/15 08:21:09 UTC

svn commit: r1691126 - in /jackrabbit/oak/branches/1.0: ./ oak-run/ oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/ oak-run/src/main/java/org/apache/jackrabbit/oak/run/ oak-run/src/main/resources/ oak-run/src/test/java/org/apache/jackrabb...

Author: chetanm
Date: Wed Jul 15 06:21:09 2015
New Revision: 1691126

URL: http://svn.apache.org/r1691126
Log:
OAK-2953 - Implement text extractor as part of oak-run

Merging r1690249, r1690636 and r1690669 from trunk with the following changes:
-- Excluded tika-parsers from being included in the shaded jar
-- Use the shade plugin instead of the assembly plugin, as the assembly plugin is not configured on the 1.0 branch
-- Some new APIs (e.g. TreeFactory) are not present on the branch, so the code was adapted accordingly


Added:
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/
      - copied from r1690249, jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
      - copied, changed from r1690636, jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
    jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/
      - copied from r1690249, jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/
Modified:
    jackrabbit/oak/branches/1.0/   (props changed)
    jackrabbit/oak/branches/1.0/oak-run/pom.xml
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
    jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
    jackrabbit/oak/branches/1.0/oak-run/src/main/resources/logback.xml
    jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java

Propchange: jackrabbit/oak/branches/1.0/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Wed Jul 15 06:21:09 2015
@@ -1,2 +1,2 @@
-/jackrabbit/oak/trunk:1584578,1584602,1584614,1584616,1584709,1584781,1584937,1585297,1585304-1585305,1585420,1585424,1585427,1585448,1585465,1585468,1585486,1585497,1585509,1585647,1585655-1585656,1585661,1585665-1585666,1585669-1585670,1585673,1585680,1585719,1585763,1585770,1585896,1585904,1585907,1585940,1585949,1585951,1585956,1585962-1585963,1586287,1586320,1586364,1586372,1586655,1586836,1587130,1587224,1587399,1587408,1587472,1587485,1587488,1587538,1587580,1587807,1588033,1588042,1588046,1588066,1588201,1589025,1589101,1589137,1589141,1589263,1589440,1589442,1589484,1589488,1589661,1589664,1589682,1589708,1589741,1589748,1589789,1589794,1589850,1589864,1590628,1590660,1590684,1590697,1590701,1590980,1590988,1591101,1591226,1591229,1591293,1591314,1591317,1591362,1591374,1591381,1591438,1591467,1591552,1591704,1591713,1591715,1591723,1591874,1592487,1592512,1592658,1592665,1592677,1592742,1592744,1592787,1592809,1592955,1593036,1593048,1593061,1593133,1593210-1593211,1593231
 ,1593245,1593250,1593294,1593304,1593317,1593342,1593554,1594158-1594164,1594166-1594167,1594169,1594237,1594800,1594808,1594835,1594888,1594987,1595147,1595457,1595856,1596241,1596474,1596534,1596844,1596889,1597569,1597795,1597854,1597860,1598292,1598302,1598352,1598369,1598595,1598631,1598696,1598732,1598797-1598798,1599160,1599299,1599332,1599416,1599434,1599671,1600088,1600935,1601309,1601388,1601578,1601649,1601676,1601757,1601768,1601814,1601833,1601838,1601853,1601878,1601888,1601922,1602156,1602170,1602174,1602179,1602183,1602201,1602207,1602227,1602256,1602261,1602342,1602796-1602797,1602800,1602809,1602853,1602872,1602914,1603155,1603307,1603401,1603441,1603748,1604166,1605030,1605036,1605038,1605292,1605447,1605526,1605670,1605725,1605831,1605852,1606077,1606079,1606087,1606638,1606641,1606644,1606708,1606711,1607031-1607032,1607077,1607127,1607141,1607152,1607185,1607196,1607331,1607362,1607366,1607392,1607526,1607557,1607664,1607737,1608560,1608731,1608783,1609064,1609
 081,1609165,1609214,1609488,1610489,1610592,1610603,1610634,1610658,1610664,1611021,1611041,1611270,1611275,1611277,1611313,1611332,1611584,1612560,1612825,1612848,1612892,1612899,1612993,1613018,1613041,1614032,1614265,1614272,1614344-1614345,1614384-1614385,1614397,1614405-1614406,1614574,1614591,1614593,1614596,1614604,1614689,1614807,1614835,1614891,1615417-1615418,1616182,1616236,1616463,1616719,1617417,1617451,1617463,1617711,1618158,1618613,1618624,1618709,1619222,1619411,1619695,1619800,1619808,1619815,1619823-1619824,1620512,1620581,1620585,1620634,1620898,1620905,1621115,1621123-1621124,1621168,1621192,1621201,1621706,1621962,1622197,1622201,1622207,1622250,1622479,1623364,1623766,1623827,1623949,1623969,1623973,1624216,1624317,1624551,1624559,1624973,1624993-1624994,1625025,1625036,1625158,1625224,1625237,1625299,1625348,1625620,1625916,1625962-1625963,1626021,1626053,1626163,1626168,1626175,1626191,1626265,1626770,1627047,1627052,1627228,1627346,1627470,1627473,1627479,1
 627503,1627586,1627590,1627715,1627731,1628180,1628198,1628262,1628447,1628608,1629688,1629840,1629858,1629917,1630055-1630057,1630156,1630299,1630338,1630773,1631283-1631284,1631333-1631334,1631617-1631619,1631630,1631699,1631704,1631711,1631967-1631969,1631986,1631990,1631999,1632002-1632003,1632017,1632258,1632264,1632270,1632293,1632303,1632592,1632605,1633315,1633389,1633559-1633560,1633562,1633567,1633571,1633598,1633608,1633641,1633687,1633697,1633768,1633783,1634505,1634513,1634774,1634779,1634781,1634792,1634803,1634814,1634816,1634838,1634841,1634852,1634864,1634896,1634898,1635044-1635045,1635060,1635077,1635089,1635102,1635108,1635178,1635218,1635387,1635435,1635518,1635563,1635586,1636336,1636348,1636505,1636585,1636799,1637368,1637382,1637413,1637651,1637815,1638779-1638783,1639260,1639577,1639622,1639963,1639966,1639973,1640134,1640143,1640523,1640555-1640556,1640694-1640695,1640715,1640722-1640723,1640728,1640863-1640872,1641340,1641346,1641350,1641352,1641541,164159
 6-1641599,1641601,1641662,1641671,1641695,1641771,1641802,1641811,1641950,1642031,1642056,1642119,1642285,1642648,1642667,1642954,1642959,1643111,1643178,1643186,1643204,1643287,1643767,1643774,1643982,1644016,1644106,1644366,1644383,1644397-1644398,1644407,1644479,1644547,1644552,1644554,1644588,1644645,1644650,1644654,1644689,1644750,1645421,1645424,1645459,1645585,1645611,1645637,1645646,1645660-1645663,1645888,1645901,1645948,1645966,1645970-1645971,1646014,1646164,1646174,1646469,1646684,1646687,1646726-1646728,1646766,1646795,1646981,1649743,1649803,1650015,1650239,1650529,1650797,1651323,1651382,1651643,1651652,1651730,1651988-1651989,1651996,1652024,1652035,1652058-1652059,1652075,1652127,1652158,1652467,1652965,1652971,1652992,1653207,1653446,1653463,1653484,1653572,1653579,1653591,1653804,1653809,1653813,1653848-1653850,1653882,1654116,1654174,1654743,1654756,1654778,1655028,1655049,1655054-1655055,1655086,1655237,1655248,1655996,1656019,1656027,1656033,1656303,1656394,165
 6400,1656425,1656427,1656453,1656628,1656678,1657128,1657132,1657163,1657188,1657265,1657511,1657766,1657804,1658470,1658977,1658983,1659285,1659483,1659527,1659550,1659578,1659765,1660100,1660154-1660155,1660383-1660384,1660409,1660426,1660676,1660870,1660872,1660897,1660903,1661069,1661122,1661146,1661158,1661226,1661630,1661643,1661645,1662313-1662315,1662323,1662381,1662450,1662456,1663241,1663275,1663288,1663448,1663526,1663528,1663565,1663578,1663666,1663705,1663730,1663753,1663854,1664038,1664184,1664228-1664229,1664231,1664381,1664569,1664947,1664987,1665184,1665257,1665271-1665272,1665274-1665275,1665436,1665604,1665634,1665758,1665835,1665892,1665897,1665910,1665918,1666100,1666102,1666177,1666218,1666220,1666351-1666352,1666381,1666384,1666426,1666491,1666787,1667062,1667184,1667293,1667462,1667498,1667502,1667573,1667590,1667696,1667782,1668160,1668275,1668624,1668641,1668645,1668649,1668665,1668671,1668683,1668688,1668845,1669072,1669096,1669135,1669337,1669361,1669579,
 1669680,1669989,1670030,1670693,1670705,1671489,1671512,1671773,1671787,1671795,1672055,1672277,1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673431,1673436,1673644,1673662-1673663,1673695,1673738,1673787,1673791,1674046,1674065,1674075,1674107,1674150,1674780,1675054,1675319,1675332,1675382,1675555,1675566,1676198,1676407,1676458,1676670,1676703,1677579,1677609,1677611,1677774,1677788,1677797,1677939,1677991,1678095-1678096,1678173,1678323,1678758,1678938,1678954,1679144,1679147,1679165,1679191,1679232,1679503,1679961,1680170,1680182,1680222,1680232,1680236,1680461,1680633,1680643,1680747,1680805-1680806,1680903,1681282,1681767,1681918,1682218,1682235,1682437,1682494,1682555,1682855,1682904,1683059,1683089,1683213,1683249,1683259,1683278,1683323,1683687,1683700,1684174,1684376,1684442,1684561,1684570,1684618,1684836,1684868,1685023,1685370,1685552,1685589,1685840,1685999,1686097,1686229,1686234,1686253,1686414,1686780,1686854,1686857,1686971,16870
 53,1687175,1687196,1687198,1687220,1687239-1687240,1687301,1687441,1687553,1688090,1688349,1688421,1688436,1688453,1688622,1688636,1688817,1689003-1689004,1689008,1689577,1689581,1689623,1689774,1689810,1689828,1689833,1689903,1690017,1690043,1690047,1690057,1690247,1690634-1690635,1690637,1690650,1690674,1690941
+/jackrabbit/oak/trunk:1584578,1584602,1584614,1584616,1584709,1584781,1584937,1585297,1585304-1585305,1585420,1585424,1585427,1585448,1585465,1585468,1585486,1585497,1585509,1585647,1585655-1585656,1585661,1585665-1585666,1585669-1585670,1585673,1585680,1585719,1585763,1585770,1585896,1585904,1585907,1585940,1585949,1585951,1585956,1585962-1585963,1586287,1586320,1586364,1586372,1586655,1586836,1587130,1587224,1587399,1587408,1587472,1587485,1587488,1587538,1587580,1587807,1588033,1588042,1588046,1588066,1588201,1589025,1589101,1589137,1589141,1589263,1589440,1589442,1589484,1589488,1589661,1589664,1589682,1589708,1589741,1589748,1589789,1589794,1589850,1589864,1590628,1590660,1590684,1590697,1590701,1590980,1590988,1591101,1591226,1591229,1591293,1591314,1591317,1591362,1591374,1591381,1591438,1591467,1591552,1591704,1591713,1591715,1591723,1591874,1592487,1592512,1592658,1592665,1592677,1592742,1592744,1592787,1592809,1592955,1593036,1593048,1593061,1593133,1593210-1593211,1593231
 ,1593245,1593250,1593294,1593304,1593317,1593342,1593554,1594158-1594164,1594166-1594167,1594169,1594237,1594800,1594808,1594835,1594888,1594987,1595147,1595457,1595856,1596241,1596474,1596534,1596844,1596889,1597569,1597795,1597854,1597860,1598292,1598302,1598352,1598369,1598595,1598631,1598696,1598732,1598797-1598798,1599160,1599299,1599332,1599416,1599434,1599671,1600088,1600935,1601309,1601388,1601578,1601649,1601676,1601757,1601768,1601814,1601833,1601838,1601853,1601878,1601888,1601922,1602156,1602170,1602174,1602179,1602183,1602201,1602207,1602227,1602256,1602261,1602342,1602796-1602797,1602800,1602809,1602853,1602872,1602914,1603155,1603307,1603401,1603441,1603748,1604166,1605030,1605036,1605038,1605292,1605447,1605526,1605670,1605725,1605831,1605852,1606077,1606079,1606087,1606638,1606641,1606644,1606708,1606711,1607031-1607032,1607077,1607127,1607141,1607152,1607185,1607196,1607331,1607362,1607366,1607392,1607526,1607557,1607664,1607737,1608560,1608731,1608783,1609064,1609
 081,1609165,1609214,1609488,1610489,1610592,1610603,1610634,1610658,1610664,1611021,1611041,1611270,1611275,1611277,1611313,1611332,1611584,1612560,1612825,1612848,1612892,1612899,1612993,1613018,1613041,1614032,1614265,1614272,1614344-1614345,1614384-1614385,1614397,1614405-1614406,1614574,1614591,1614593,1614596,1614604,1614689,1614807,1614835,1614891,1615417-1615418,1616182,1616236,1616463,1616719,1617417,1617451,1617463,1617711,1618158,1618613,1618624,1618709,1619222,1619411,1619695,1619800,1619808,1619815,1619823-1619824,1620512,1620581,1620585,1620634,1620898,1620905,1621115,1621123-1621124,1621168,1621192,1621201,1621706,1621962,1622197,1622201,1622207,1622250,1622479,1623364,1623766,1623827,1623949,1623969,1623973,1624216,1624317,1624551,1624559,1624973,1624993-1624994,1625025,1625036,1625158,1625224,1625237,1625299,1625348,1625620,1625916,1625962-1625963,1626021,1626053,1626163,1626168,1626175,1626191,1626265,1626770,1627047,1627052,1627228,1627346,1627470,1627473,1627479,1
 627503,1627586,1627590,1627715,1627731,1628180,1628198,1628262,1628447,1628608,1629688,1629840,1629858,1629917,1630055-1630057,1630156,1630299,1630338,1630773,1631283-1631284,1631333-1631334,1631617-1631619,1631630,1631699,1631704,1631711,1631967-1631969,1631986,1631990,1631999,1632002-1632003,1632017,1632258,1632264,1632270,1632293,1632303,1632592,1632605,1633315,1633389,1633559-1633560,1633562,1633567,1633571,1633598,1633608,1633641,1633687,1633697,1633768,1633783,1634505,1634513,1634774,1634779,1634781,1634792,1634803,1634814,1634816,1634838,1634841,1634852,1634864,1634896,1634898,1635044-1635045,1635060,1635077,1635089,1635102,1635108,1635178,1635218,1635387,1635435,1635518,1635563,1635586,1636336,1636348,1636505,1636585,1636799,1637368,1637382,1637413,1637651,1637815,1638779-1638783,1639260,1639577,1639622,1639963,1639966,1639973,1640134,1640143,1640523,1640555-1640556,1640694-1640695,1640715,1640722-1640723,1640728,1640863-1640872,1641340,1641346,1641350,1641352,1641541,164159
 6-1641599,1641601,1641662,1641671,1641695,1641771,1641802,1641811,1641950,1642031,1642056,1642119,1642285,1642648,1642667,1642954,1642959,1643111,1643178,1643186,1643204,1643287,1643767,1643774,1643982,1644016,1644106,1644366,1644383,1644397-1644398,1644407,1644479,1644547,1644552,1644554,1644588,1644645,1644650,1644654,1644689,1644750,1645421,1645424,1645459,1645585,1645611,1645637,1645646,1645660-1645663,1645888,1645901,1645948,1645966,1645970-1645971,1646014,1646164,1646174,1646469,1646684,1646687,1646726-1646728,1646766,1646795,1646981,1649743,1649803,1650015,1650239,1650529,1650797,1651323,1651382,1651643,1651652,1651730,1651988-1651989,1651996,1652024,1652035,1652058-1652059,1652075,1652127,1652158,1652467,1652965,1652971,1652992,1653207,1653446,1653463,1653484,1653572,1653579,1653591,1653804,1653809,1653813,1653848-1653850,1653882,1654116,1654174,1654743,1654756,1654778,1655028,1655049,1655054-1655055,1655086,1655237,1655248,1655996,1656019,1656027,1656033,1656303,1656394,165
 6400,1656425,1656427,1656453,1656628,1656678,1657128,1657132,1657163,1657188,1657265,1657511,1657766,1657804,1658470,1658977,1658983,1659285,1659483,1659527,1659550,1659578,1659765,1660100,1660154-1660155,1660383-1660384,1660409,1660426,1660676,1660870,1660872,1660897,1660903,1661069,1661122,1661146,1661158,1661226,1661630,1661643,1661645,1662313-1662315,1662323,1662381,1662450,1662456,1663241,1663275,1663288,1663448,1663526,1663528,1663565,1663578,1663666,1663705,1663730,1663753,1663854,1664038,1664184,1664228-1664229,1664231,1664381,1664569,1664947,1664987,1665184,1665257,1665271-1665272,1665274-1665275,1665436,1665604,1665634,1665758,1665835,1665892,1665897,1665910,1665918,1666100,1666102,1666177,1666218,1666220,1666351-1666352,1666381,1666384,1666426,1666491,1666787,1667062,1667184,1667293,1667462,1667498,1667502,1667573,1667590,1667696,1667782,1668160,1668275,1668624,1668641,1668645,1668649,1668665,1668671,1668683,1668688,1668845,1669072,1669096,1669135,1669337,1669361,1669579,
 1669680,1669989,1670030,1670693,1670705,1671489,1671512,1671773,1671787,1671795,1672055,1672277,1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673431,1673436,1673644,1673662-1673663,1673695,1673738,1673787,1673791,1674046,1674065,1674075,1674107,1674150,1674780,1675054,1675319,1675332,1675382,1675555,1675566,1676198,1676407,1676458,1676670,1676703,1677579,1677609,1677611,1677774,1677788,1677797,1677939,1677991,1678095-1678096,1678173,1678323,1678758,1678938,1678954,1679144,1679147,1679165,1679191,1679232,1679503,1679961,1680170,1680182,1680222,1680232,1680236,1680461,1680633,1680643,1680747,1680805-1680806,1680903,1681282,1681767,1681918,1682218,1682235,1682437,1682494,1682555,1682855,1682904,1683059,1683089,1683213,1683249,1683259,1683278,1683323,1683687,1683700,1684174,1684376,1684442,1684561,1684570,1684618,1684836,1684868,1685023,1685370,1685552,1685589,1685840,1685999,1686097,1686229,1686234,1686253,1686414,1686780,1686854,1686857,1686971,16870
 53,1687175,1687196,1687198,1687220,1687239-1687240,1687301,1687441,1687553,1688090,1688349,1688421,1688436,1688453,1688622,1688636,1688817,1689003-1689004,1689008,1689577,1689581,1689623,1689774,1689810,1689828,1689833,1689903,1690017,1690043,1690047,1690057,1690247,1690249,1690634-1690637,1690650,1690669,1690674,1690941
 /jackrabbit/trunk:1345480

Modified: jackrabbit/oak/branches/1.0/oak-run/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/pom.xml?rev=1691126&r1=1691125&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/pom.xml (original)
+++ jackrabbit/oak/branches/1.0/oak-run/pom.xml Wed Jul 15 06:21:09 2015
@@ -190,7 +190,27 @@
       <groupId>org.apache.jclouds.provider</groupId>
       <artifactId>aws-s3</artifactId>
     </dependency>
-    
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>1.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+      <version>1.1</version>
+    </dependency>
+    <!--
+      Set parsers to test scope as pulling this in shaded jar
+      results in pulling lot more dependencies
+    -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>1.5</version>
+      <scope>test</scope>
+    </dependency>
+
     <!-- Findbugs annotations -->
     <dependency>
       <groupId>com.google.code.findbugs</groupId>

Modified: jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java?rev=1691126&r1=1690249&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java Wed Jul 15 06:21:09 2015
@@ -31,7 +31,6 @@ import java.util.Map;
 import com.google.common.base.Strings;
 import com.google.common.collect.ComparisonChain;
 import com.google.common.collect.Maps;
-import org.codehaus.groovy.runtime.StringGroovyMethods;
 
 import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
 
@@ -141,7 +140,8 @@ class BinaryStats {
     }
 
     private static String center(String s, int width) {
-        return StringGroovyMethods.center(s, width);
+        //1.0 branch does not have Groovy so no centering is done; return the string as is
+        return s;
     }
 
     private static class MimeTypeStats implements Comparable<MimeTypeStats> {

Copied: jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java (from r1690636, jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java)
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java?p2=jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java&p1=jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java&r1=1690636&r2=1691126&rev=1691126&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java Wed Jul 15 06:21:09 2015
@@ -32,15 +32,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public class CSVFileGenerator {
-    /*
-        Instead of using the FORMAT from CSVFileBinaryResourceProvider
-        defining our own without header. Otherwise commons-csv was always
-        adding the header
-     */
-    private static final CSVFormat FORMAT = CSVFormat.DEFAULT
-            .withCommentMarker('#')
-            .withNullString("") //Empty string are considered as null
-            .withIgnoreSurroundingSpaces();
     private final Logger log = LoggerFactory.getLogger(getClass());
     private File outFile;
 
@@ -52,7 +43,8 @@ public class CSVFileGenerator {
         Closer closer = Closer.create();
         int count = 0;
         try{
-            CSVPrinter printer = new CSVPrinter(Files.newWriter(outFile, Charsets.UTF_8), FORMAT);
+            CSVPrinter printer = new CSVPrinter(Files.newWriter(outFile, Charsets.UTF_8),
+                    CSVFileBinaryResourceProvider.FORMAT);
             for (BinaryResource br : binaries){
                 count++;
                 printer.printRecord(

Modified: jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java?rev=1691126&r1=1690249&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java Wed Jul 15 06:21:09 2015
@@ -20,6 +20,7 @@
 package org.apache.jackrabbit.oak.plugins.tika;
 
 import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 
 import com.google.common.base.Function;
@@ -30,13 +31,14 @@ import org.apache.jackrabbit.oak.api.Blo
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.plugins.tree.ImmutableTree;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static com.google.common.base.Predicates.notNull;
-import static org.apache.jackrabbit.oak.plugins.tree.TreeFactory.createReadOnlyTree;
 import static org.apache.jackrabbit.oak.spi.state.NodeStateUtils.getNode;
 
 class NodeStoreBinaryResourceProvider implements BinaryResourceProvider {
@@ -72,7 +74,9 @@ class NodeStoreBinaryResourceProvider im
 
             Blob blob = data.getValue(Type.BINARY);
             String blobId = blob.getContentIdentity();
-            if (blobId == null) {
+            //Check for ref being non null to ensure its not an inlined binary
+            //For Segment ContentIdentity defaults to RecordId
+            if (blob.getReference() == null || blobId == null) {
                 log.debug("Ignoring jcr:data property at {} as its an inlined blob", tree.getPath());
                 return null;
             }
@@ -97,4 +101,8 @@ class NodeStoreBinaryResourceProvider im
         PropertyState prop = tree.getProperty(name);
         return prop != null ? prop.getValue(Type.STRING) : null;
     }
+
+    private static Tree createReadOnlyTree(@Nonnull NodeState rootState) {
+        return new ImmutableTree(rootState);
+    }
 }

Modified: jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1691126&r1=1690249&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Wed Jul 15 06:21:09 2015
@@ -21,16 +21,26 @@ package org.apache.jackrabbit.oak.plugin
 
 import java.io.Closeable;
 import java.io.File;
+import java.io.IOException;
 import java.util.List;
 
 import com.google.common.io.Closer;
+import com.mongodb.MongoClientURI;
+import com.mongodb.MongoURI;
 import joptsimple.OptionParser;
 import joptsimple.OptionSet;
 import joptsimple.OptionSpec;
 import org.apache.jackrabbit.core.data.FileDataStore;
 import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
 import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
+import org.apache.jackrabbit.oak.plugins.document.DocumentMK;
+import org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore;
+import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
+import org.apache.jackrabbit.oak.plugins.segment.SegmentNodeStore;
+import org.apache.jackrabbit.oak.plugins.segment.file.FileStore;
+import org.apache.jackrabbit.oak.run.Main;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,8 +99,6 @@ public class TextExtractorMain {
                     .withRequiredArg()
                     .ofType(Integer.class);
 
-            //TODO implement generate support
-
             OptionSpec<String> nonOption = parser.nonOptions(h);
 
             OptionSet options = parser.parse(args);
@@ -108,7 +116,8 @@ public class TextExtractorMain {
 
             boolean report = nonOptions.contains("report");
             boolean extract = nonOptions.contains("extract");
-            File dataFile;
+            boolean generate = nonOptions.contains("generate");
+            File dataFile = null;
             File fdsDir;
             File storeDir = null;
             File tikaConfigFile = null;
@@ -142,23 +151,35 @@ public class TextExtractorMain {
 
             if (options.has(dataFileSpec)) {
                 dataFile = dataFileSpec.value(options);
-                checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
-                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
             }
 
-            if (binaryResourceProvider instanceof Closeable) {
-                closer.register((Closeable) binaryResourceProvider);
-            }
+            checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec);
 
             if (report || extract) {
-                checkNotNull(binaryResourceProvider, "BinaryProvider source must be specified either " +
-                        "via '%s' or '%s", dataFileSpec.options(), nodeStoreSpec.options());
+                checkArgument(dataFile.exists(),
+                        "Data file %s does not exist", dataFile.getAbsolutePath());
+
+                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
+                if (binaryResourceProvider instanceof Closeable) {
+                    closer.register((Closeable) binaryResourceProvider);
+                }
 
                 stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
                 String summary = stats.getSummary();
                 log.info(summary);
             }
 
+            if (generate){
+                String src = nodeStoreSpec.value(options);
+                checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " +
+                        "must be specified via %s", fdsDirSpec.options());
+                checkNotNull(dataFile, "Data file path not provided");
+                NodeStore nodeStore = bootStrapNodeStore(src, blobStore, closer);
+                BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
+                CSVFileGenerator generator = new CSVFileGenerator(dataFile);
+                generator.generate(brp.getBinaries(path));
+            }
+
             if (extract) {
                 checkNotNull(storeDir, "Directory to store extracted text content " +
                         "must be specified via %s", storeDirSpec.options());
@@ -197,4 +218,56 @@ public class TextExtractorMain {
             closer.close();
         }
     }
+
+    private static NodeStore bootStrapNodeStore(String src, BlobStore blobStore,
+                                                Closer closer) throws IOException {
+        if (src.startsWith(MongoURI.MONGODB_PREFIX)) {
+            MongoClientURI uri = new MongoClientURI(src);
+            if (uri.getDatabase() == null) {
+                System.err.println("Database missing in MongoDB URI: "
+                        + uri.getURI());
+                System.exit(1);
+            }
+            MongoConnection mongo = new MongoConnection(uri.getURI());
+            closer.register(asCloseable(mongo));
+            DocumentNodeStore store = new DocumentMK.Builder()
+                    .setBlobStore(blobStore)
+                    .setMongoDB(mongo.getDB()).getNodeStore();
+            closer.register(asCloseable(store));
+            return store;
+        }
+        FileStore fs = FileStore.newFileStore(new File(src))
+                .withBlobStore(blobStore)
+                .withMemoryMapping(Main.TAR_STORAGE_MEMORY_MAPPED)
+                .create();
+        closer.register(asCloseable(fs));
+        return new SegmentNodeStore(fs);
+    }
+
+    private static Closeable asCloseable(final FileStore fs) {
+        return new Closeable() {
+            @Override
+            public void close() throws IOException {
+                fs.close();
+            }
+        };
+    }
+
+    private static Closeable asCloseable(final DocumentNodeStore dns) {
+        return new Closeable() {
+            @Override
+            public void close() throws IOException {
+                dns.dispose();
+            }
+        };
+    }
+
+    private static Closeable asCloseable(final MongoConnection con) {
+        return new Closeable() {
+            @Override
+            public void close() throws IOException {
+                con.close();
+            }
+        };
+    }
 }

Modified: jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java?rev=1691126&r1=1691125&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java Wed Jul 15 06:21:09 2015
@@ -74,6 +74,7 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.plugins.segment.file.FileStore;
 import org.apache.jackrabbit.oak.plugins.segment.standby.client.StandbyClient;
 import org.apache.jackrabbit.oak.plugins.segment.standby.server.StandbyServer;
+import org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.apache.jackrabbit.oak.upgrade.RepositoryUpgrade;
@@ -147,6 +148,9 @@ public class Main {
             case REPAIR:
                 repair(args);
                 break;
+            case TIKA:
+                TextExtractorMain.main(args);
+                break;
             default:
                 System.err.println("Unknown command: " + mode);
                 System.exit(1);
@@ -882,7 +886,8 @@ public class Main {
         PRIMARY("primary"),
         STANDBY("standy"),
         CHECKPOINTS("checkpoints"),
-        REPAIR("repair");
+        REPAIR("repair"),
+        TIKA("tika");
 
         private final String name;
 

Modified: jackrabbit/oak/branches/1.0/oak-run/src/main/resources/logback.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/main/resources/logback.xml?rev=1691126&r1=1691125&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/main/resources/logback.xml (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/main/resources/logback.xml Wed Jul 15 06:21:09 2015
@@ -30,6 +30,8 @@
   <!-- Display info messages from the benchmark suite -->
   <logger name="org.apache.jackrabbit.oak.benchmark" level="INFO"/>
 
+  <logger name="org.apache.jackrabbit.oak.plugins.tika" level="INFO"/>
+  
   <root level="warn">
     <appender-ref ref="STDERR" />
   </root>

Modified: jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java?rev=1691126&r1=1690249&r2=1691126&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (original)
+++ jackrabbit/oak/branches/1.0/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java Wed Jul 15 06:21:09 2015
@@ -19,8 +19,13 @@
 
 package org.apache.jackrabbit.oak.plugins.tika;
 
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.plugins.blob.BlobStoreBlob;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
@@ -28,13 +33,18 @@ import org.apache.jackrabbit.oak.spi.blo
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
 
 import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
 import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT;
 import static org.junit.Assert.assertEquals;
 
 public class NodeStoreBinaryResourceProviderTest {
+    @Rule // JUnit rule supplying a scratch directory; csvGenerator() places test.csv in it
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
     private NodeState root = INITIAL_CONTENT;
 
     @Test
@@ -57,6 +67,24 @@ public class NodeStoreBinaryResourceProv
         assertEquals("text/foo", bs.getMimeType());
         assertEquals("bar", bs.getEncoding());
         assertEquals("id2", bs.getBlobId());
+    }
+
+    @Test
+    public void csvGenerator() throws Exception { // feeds the node store's binaries to CSVFileGenerator, then reads the same file via CSVFileBinaryResourceProvider
+        File csv = new File(temporaryFolder.getRoot(), "test.csv"); // lives in the rule-managed temp folder
+        BlobStore blobStore = new MemoryBlobStore();
+        NodeBuilder builder = root.builder();
+        createFileNode(builder, "a", blobOf("foo", blobStore), "text/plain"); // blobOf() writes the content into blobStore first
+        createFileNode(builder, "b", blobOf("hello", blobStore), "text/plain");
+
+        NodeStore store = new MemoryNodeStore(builder.getNodeState());
+
+        NodeStoreBinaryResourceProvider extractor = new NodeStoreBinaryResourceProvider(store, blobStore);
+        CSVFileGenerator generator = new CSVFileGenerator(csv);
+        generator.generate(extractor.getBinaries("/")); // presumably writes one entry per binary to csv -- confirm in CSVFileGenerator
+
+        CSVFileBinaryResourceProvider csvbrp = new CSVFileBinaryResourceProvider(csv, blobStore); // same file and blob store as used above
+        assertEquals(2, csvbrp.getBinaries("/").size()); // expects both file nodes ("a" and "b") back
 
     }
 
@@ -67,6 +95,11 @@ public class NodeStoreBinaryResourceProv
         return jcrContent;
     }
 
+    private Blob blobOf(String content, BlobStore bs) throws IOException { // writes content into bs and wraps the returned blob id
+        String id = bs.writeBlob(new ByteArrayInputStream(content.getBytes())); // NOTE(review): getBytes() uses the platform default charset; the callers in this test pass ASCII literals only
+        return new BlobStoreBlob(bs, id);
+    }
+
     private static class IdBlob extends ArrayBasedBlob {
         final String id;
 
@@ -76,6 +109,11 @@ public class NodeStoreBinaryResourceProv
         }
 
         @Override
+        public String getReference() { // the reference is simply the id field
+            return id;
+        }
+
+        @Override // getContentIdentity() below also returns the id field
         public String getContentIdentity() {
             return id;
         }