You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@oozie.apache.org by Idris Ali <ps...@gmail.com> on 2012/02/14 06:32:02 UTC

Dataset URI resolution with regex pattern

Hi,

I have a coordinator XML, where the Dataset "inPath" should wait on base
directory:
 <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>

However while passing this to MR job I need to append a Regex Pattern:
                <property>
                    <name>inPath</name>

<value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
                </property>

But Oozie seems to append this pattern only to the last resolved dataset
instance and not to all of them. How do I define the coordinator so that every
resolved dataset instance has the regex pattern appended? Any help is
appreciated; otherwise we might need to create a custom EL function.

Here is my coordinator.xml:

<coordinator-app name="rm-coord" frequency="${coord:days(1)}"
start="${start}" end="${end}" timezone="UTC"
                 xmlns="uri:oozie:coordinator:0.2">
    <controls>
        <concurrency>1</concurrency>
    </controls>

    <datasets>
        <dataset name="inPath" frequency="${coord:hours(1)}"
initial-instance="2012-01-30T00:00Z" timezone="UTC">

<uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
        <done-flag></done-flag>
        </dataset>
        <dataset name="interPath" frequency="${coord:days(1)}"
initial-instance="2012-01-30T01:00Z" timezone="UTC">

<uri-template>/projects/bi/ioout/interactions/${YEAR}-${MONTH}-${DAY}</uri-template>
    <done-flag></done-flag>
        </dataset>
        <dataset name="outputhdfsdir" frequency="${coord:days(1)}"
initial-instance="2012-01-31T01:00Z" timezone="UTC">

<uri-template>/projects/bi/rmc/daily/AdvInteractionSummary</uri-template>
        <done-flag></done-flag>
        </dataset>
    </datasets>

    <input-events>
        <data-in name="inPath-event" dataset="inPath">
            <start-instance>${coord:current(-26)}</start-instance>
            <end-instance>${coord:current(0)}</end-instance>
        </data-in>
    </input-events>
    <output-events>
        <data-out name="interPath-event" dataset="interPath">
            <instance>${coord:current(-1)}</instance>
        </data-out>
        <data-out name="outputhdfsdir-event" dataset="outputhdfsdir">
            <instance>${coord:current(0)}</instance>
        </data-out>
    </output-events>

    <action>
        <workflow>
            <app-path>${nameNode}/user/${coord:user()}/apps/rm</app-path>
            <configuration>
                <property>
                    <name>jobTracker</name>
                    <value>${jobTracker}</value>
                </property>
                <property>
                    <name>nameNode</name>
                    <value>${nameNode}</value>
                </property>
                <property>
                    <name>queueName</name>
                    <value>${queueName}</value>
                </property>
                <property>
                    <name>logTime</name>
                    <value>${coord:formatTime(coord:actualTime(),
'yyyy-MM-dd')}</value>
                </property>
                <property>
                    <name>inPath</name>

<value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
                </property>
                <property>
                    <name>interPath</name>
                    <value>${coord:dataOut('interPath-event')}</value>
                </property>
                <property>
                    <name>outputhdfsdir</name>
                    <value>${coord:dataOut('outputhdfsdir-event')}</value>
                </property>
            </configuration>
        </workflow>
    </action>
</coordinator-app>


And this is what the resolved workflow looks like:

<java xmlns="uri:oozie:workflow:0.2">
  <job-tracker>shaik-idris:8021</job-tracker>
  <name-node>hdfs://shaik-idris:8020</name-node>
  <configuration>
    <property>
      <name>mapred.job.queue.name</name>
      <value>default</value>
    </property>
  </configuration>

<main-class>com.inmobi.grid.richmedia.aggregation.InteractionAggregation</main-class>
  <arg>-Dlog.time=2012-02-12</arg>

<arg>-Dlog.in.path=hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-01,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-00,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-23,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-22,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-21,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-20,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-19,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-18,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-17,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-16,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-15,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-14,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-13,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-12,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-11,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-10,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-09,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-08,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-07,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-06,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-05,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-04,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-03,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-02,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-01,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-00/clickon/*/*/OK/*/*</arg>
  <arg>-Dout.path=/projects/bi/ioout/interactions/2012-01-30</arg>

<arg>-Dtmpjars=hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</arg>
  <file>/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</file>
  <capture-output />
</java>

Thanks,
-Idris

Re: Dataset URI resolution with regex pattern

Posted by Idris Ali <ps...@gmail.com>.
Hi Mohammad,

I have defined the full use-case and opened a JIRA for the same.
https://issues.apache.org/jira/browse/OOZIE-699

Thanks,
-Idris

On Wed, Feb 15, 2012 at 1:40 AM, Mohammad Islam <mi...@yahoo.com> wrote:

> So you don't have input dependency but you want to pass the directory.
>
> In that case, I would ask you consider the dateOffset and formatTime EL
> function together by-passing all input-event and data set.
> For example, this is your expected template: /a/b/*/c/d/TIMESTAMP/e/*/f.
> You could use something like this for current(-1) like timestamp:
>  /a/b/*/c/d/coord:formatTime(coord:dateOffset(coord.nominalTime(), -1,
> 'DAY'), "your-format")/e/*/f
>
>
> However there could still be something missing. In that case, you could
> create a JIRA that fills up that gap.
>
>
> http://yahoo.github.com/oozie/releases/3.1.0/CoordinatorFunctionalSpec.html#a6.8._Parameterization_of_Coordinator_Application
>
> Regards,
> Mohammad
>
>
> ----- Original Message -----
> From: Idris Ali <ps...@gmail.com>
> To: oozie-users@incubator.apache.org; Mohammad Islam <mi...@yahoo.com>
> Cc:
> Sent: Tuesday, February 14, 2012 4:36 AM
> Subject: Re: Dataset URI resolution with regex pattern
>
> Hi Mohammad,
>
> This makes sense, but I was looking for some out-of-the-box oozie solution,
> if somehow I can specify oozie not to Gate(wait) on the URI, but still be
> able to generate the events, say there is no done flag (neither _SUCCESS
> nor Directory or any file).
>
> One way is if we specify it as output event and pass this as property to
> oozie, then there is no gating, but output event can have only have one
> instance unlike start and end instance.
>
> Thanks again,
> -Idris
>
> On Tue, Feb 14, 2012 at 2:44 PM, Mohammad Islam <mi...@yahoo.com>
> wrote:
>
> > Hi Idris,
> > One possible way is that you can write a Java action before your MR
> action
> > in WF.xml.
> > Coordinator will pass two properties: one for all the paths, another for
> > appender.
> > These two properties will be passed to the java action. Java action will
> > parse the first and append the appender on each path. At last, using
> > capture-output, java action will send back the modified path list. At
> last,
> > use the java action passed value as the input path to MR action.
> > If something is not clear or need more specific help, please feel free to
> > let us know.
> >
> > Regards,
> > Mohammad
> >
> >
> > ________________________________
> > From: Idris Ali <ps...@gmail.com>
> > To: oozie-users@incubator.apache.org
> > Sent: Monday, February 13, 2012 9:32 PM
> > Subject: Dataset URI resolution with regex pattern
> >
> > Hi,
> >
> > I have a coordinator XML, where the Dataset "inPath" should wait on base
> > directory:
> >
> >
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
> >
> > However while passing this to MR job I need to append a Regex Pattern:
> >                 <property>
> >                     <name>inPath</name>
> >
> > <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
> >                 </property>
> >
> > But oozie seems to append this path only to the last resolved dataset
> event
> > and not to all, how do I define the coordinator so that every resolved
> > dataset event has the regex pattern? Any help is appreciated or else we
> > might need to create custom EL function.
> >
> > Here is my coordinator.xml:
> >
> > <coordinator-app name="rm-coord" frequency="${coord:days(1)}"
> > start="${start}" end="${end}" timezone="UTC"
> >                  xmlns="uri:oozie:coordinator:0.2">
> >     <controls>
> >         <concurrency>1</concurrency>
> >     </controls>
> >
> >     <datasets>
> >         <dataset name="inPath" frequency="${coord:hours(1)}"
> > initial-instance="2012-01-30T00:00Z" timezone="UTC">
> >
> >
> >
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
> >         <done-flag></done-flag>
> >         </dataset>
> >         <dataset name="interPath" frequency="${coord:days(1)}"
> > initial-instance="2012-01-30T01:00Z" timezone="UTC">
> >
> >
> >
> <uri-template>/projects/bi/ioout/interactions/${YEAR}-${MONTH}-${DAY}</uri-template>
> >     <done-flag></done-flag>
> >         </dataset>
> >         <dataset name="outputhdfsdir" frequency="${coord:days(1)}"
> > initial-instance="2012-01-31T01:00Z" timezone="UTC">
> >
> > <uri-template>/projects/bi/rmc/daily/AdvInteractionSummary</uri-template>
> >         <done-flag></done-flag>
> >         </dataset>
> >     </datasets>
> >
> >     <input-events>
> >         <data-in name="inPath-event" dataset="inPath">
> >             <start-instance>${coord:current(-26)}</start-instance>
> >             <end-instance>${coord:current(0)}</end-instance>
> >         </data-in>
> >     </input-events>
> >     <output-events>
> >         <data-out name="interPath-event" dataset="interPath">
> >             <instance>${coord:current(-1)}</instance>
> >         </data-out>
> >         <data-out name="outputhdfsdir-event" dataset="outputhdfsdir">
> >             <instance>${coord:current(0)}</instance>
> >         </data-out>
> >     </output-events>
> >
> >     <action>
> >         <workflow>
> >             <app-path>${nameNode}/user/${coord:user()}/apps/rm</app-path>
> >             <configuration>
> >                 <property>
> >                     <name>jobTracker</name>
> >                     <value>${jobTracker}</value>
> >                 </property>
> >                 <property>
> >                     <name>nameNode</name>
> >                     <value>${nameNode}</value>
> >                 </property>
> >                 <property>
> >                     <name>queueName</name>
> >                     <value>${queueName}</value>
> >                 </property>
> >                 <property>
> >                     <name>logTime</name>
> >                     <value>${coord:formatTime(coord:actualTime(),
> > 'yyyy-MM-dd')}</value>
> >                 </property>
> >                 <property>
> >                     <name>inPath</name>
> >
> > <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
> >                 </property>
> >                 <property>
> >                     <name>interPath</name>
> >                     <value>${coord:dataOut('interPath-event')}</value>
> >                 </property>
> >                 <property>
> >                     <name>outputhdfsdir</name>
> >
>  <value>${coord:dataOut('outputhdfsdir-event')}</value>
> >                 </property>
> >             </configuration>
> >         </workflow>
> >     </action>
> > </coordinator-app>
> >
> >
> > And this is how the resolved Workflow looks like:
> >
> > <java xmlns="uri:oozie:workflow:0.2">
> >   <job-tracker>shaik-idris:8021</job-tracker>
> >   <name-node>hdfs://shaik-idris:8020</name-node>
> >   <configuration>
> >     <property>
> >       <name>mapred.job.queue.name</name>
> >       <value>default</value>
> >     </property>
> >   </configuration>
> >
> >
> >
> <main-class>com.inmobi.grid.richmedia.aggregation.InteractionAggregation</main-class>
> >   <arg>-Dlog.time=2012-02-12</arg>
> >
> > <arg>-Dlog.in.path=hdfs://shaik-idris:8020/projects/bi/ioout/
> 2012-01-31-01
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-00
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-23
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-22
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-21
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-20
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-19
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-18
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-17
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-16
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-15
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-14
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-13
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-12
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-11
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-10
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30
> > -09,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-08
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-07
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-06
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-05
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-04
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-03
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-02
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-01
> > ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-00
> > /clickon/*/*/OK/*/*</arg>
> >   <arg>-Dout.path=/projects/bi/ioout/interactions/2012-01-30</arg>
> >
> >
> >
> <arg>-Dtmpjars=hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</arg>
> >   <file>/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar</file>
> >   <file>/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar</file>
> >   <file>/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar</file>
> >   <file>/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</file>
> >   <capture-output />
> > </java>
> >
> > Thanks,
> > -Idris
> >
>
>

Re: Dataset URI resolution with regex pattern

Posted by Mohammad Islam <mi...@yahoo.com>.
So you don't have input dependency but you want to pass the directory.

In that case, I would ask you to consider using the dateOffset and formatTime EL functions together, bypassing input events and datasets entirely.
For example, suppose this is your expected template: /a/b/*/c/d/TIMESTAMP/e/*/f.
You could use something like this to get a current(-1)-like timestamp:
 /a/b/*/c/d/${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), "your-format")}/e/*/f

 
However there could still be something missing. In that case, you could create a JIRA that fills up that gap.
  
http://yahoo.github.com/oozie/releases/3.1.0/CoordinatorFunctionalSpec.html#a6.8._Parameterization_of_Coordinator_Application
 
Regards,
Mohammad


----- Original Message -----
From: Idris Ali <ps...@gmail.com>
To: oozie-users@incubator.apache.org; Mohammad Islam <mi...@yahoo.com>
Cc: 
Sent: Tuesday, February 14, 2012 4:36 AM
Subject: Re: Dataset URI resolution with regex pattern

Hi Mohammad,

This makes sense, but I was looking for some out-of-the-box oozie solution,
if somehow I can specify oozie not to Gate(wait) on the URI, but still be
able to generate the events, say there is no done flag (neither _SUCCESS
nor Directory or any file).

One way is if we specify it as output event and pass this as property to
oozie, then there is no gating, but output event can have only have one
instance unlike start and end instance.

Thanks again,
-Idris

On Tue, Feb 14, 2012 at 2:44 PM, Mohammad Islam <mi...@yahoo.com> wrote:

> Hi Idris,
> One possible way is that you can write a Java action before your MR action
> in WF.xml.
> Coordinator will pass two properties: one for all the paths, another for
> appender.
> These two properties will be passed to the java action. Java action will
> parse the first and append the appender on each path. At last, using
> capture-output, java action will send back the modified path list. At last,
> use the java action passed value as the input path to MR action.
> If something is not clear or need more specific help, please feel free to
> let us know.
>
> Regards,
> Mohammad
>
>
> ________________________________
> From: Idris Ali <ps...@gmail.com>
> To: oozie-users@incubator.apache.org
> Sent: Monday, February 13, 2012 9:32 PM
> Subject: Dataset URI resolution with regex pattern
>
> Hi,
>
> I have a coordinator XML, where the Dataset "inPath" should wait on base
> directory:
>
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
>
> However while passing this to MR job I need to append a Regex Pattern:
>                 <property>
>                     <name>inPath</name>
>
> <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
>                 </property>
>
> But oozie seems to append this path only to the last resolved dataset event
> and not to all, how do I define the coordinator so that every resolved
> dataset event has the regex pattern? Any help is appreciated or else we
> might need to create custom EL function.
>
> Here is my coordinator.xml:
>
> <coordinator-app name="rm-coord" frequency="${coord:days(1)}"
> start="${start}" end="${end}" timezone="UTC"
>                  xmlns="uri:oozie:coordinator:0.2">
>     <controls>
>         <concurrency>1</concurrency>
>     </controls>
>
>     <datasets>
>         <dataset name="inPath" frequency="${coord:hours(1)}"
> initial-instance="2012-01-30T00:00Z" timezone="UTC">
>
>
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
>         <done-flag></done-flag>
>         </dataset>
>         <dataset name="interPath" frequency="${coord:days(1)}"
> initial-instance="2012-01-30T01:00Z" timezone="UTC">
>
>
> <uri-template>/projects/bi/ioout/interactions/${YEAR}-${MONTH}-${DAY}</uri-template>
>     <done-flag></done-flag>
>         </dataset>
>         <dataset name="outputhdfsdir" frequency="${coord:days(1)}"
> initial-instance="2012-01-31T01:00Z" timezone="UTC">
>
> <uri-template>/projects/bi/rmc/daily/AdvInteractionSummary</uri-template>
>         <done-flag></done-flag>
>         </dataset>
>     </datasets>
>
>     <input-events>
>         <data-in name="inPath-event" dataset="inPath">
>             <start-instance>${coord:current(-26)}</start-instance>
>             <end-instance>${coord:current(0)}</end-instance>
>         </data-in>
>     </input-events>
>     <output-events>
>         <data-out name="interPath-event" dataset="interPath">
>             <instance>${coord:current(-1)}</instance>
>         </data-out>
>         <data-out name="outputhdfsdir-event" dataset="outputhdfsdir">
>             <instance>${coord:current(0)}</instance>
>         </data-out>
>     </output-events>
>
>     <action>
>         <workflow>
>             <app-path>${nameNode}/user/${coord:user()}/apps/rm</app-path>
>             <configuration>
>                 <property>
>                     <name>jobTracker</name>
>                     <value>${jobTracker}</value>
>                 </property>
>                 <property>
>                     <name>nameNode</name>
>                     <value>${nameNode}</value>
>                 </property>
>                 <property>
>                     <name>queueName</name>
>                     <value>${queueName}</value>
>                 </property>
>                 <property>
>                     <name>logTime</name>
>                     <value>${coord:formatTime(coord:actualTime(),
> 'yyyy-MM-dd')}</value>
>                 </property>
>                 <property>
>                     <name>inPath</name>
>
> <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
>                 </property>
>                 <property>
>                     <name>interPath</name>
>                     <value>${coord:dataOut('interPath-event')}</value>
>                 </property>
>                 <property>
>                     <name>outputhdfsdir</name>
>                     <value>${coord:dataOut('outputhdfsdir-event')}</value>
>                 </property>
>             </configuration>
>         </workflow>
>     </action>
> </coordinator-app>
>
>
> And this is how the resolved Workflow looks like:
>
> <java xmlns="uri:oozie:workflow:0.2">
>   <job-tracker>shaik-idris:8021</job-tracker>
>   <name-node>hdfs://shaik-idris:8020</name-node>
>   <configuration>
>     <property>
>       <name>mapred.job.queue.name</name>
>       <value>default</value>
>     </property>
>   </configuration>
>
>
> <main-class>com.inmobi.grid.richmedia.aggregation.InteractionAggregation</main-class>
>   <arg>-Dlog.time=2012-02-12</arg>
>
> <arg>-Dlog.in.path=hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-01
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-00
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-23
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-22
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-21
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-20
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-19
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-18
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-17
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-16
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-15
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-14
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-13
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-12
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-11
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-10
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30
> -09,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-08
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-07
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-06
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-05
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-04
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-03
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-02
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-01
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-00
> /clickon/*/*/OK/*/*</arg>
>   <arg>-Dout.path=/projects/bi/ioout/interactions/2012-01-30</arg>
>
>
> <arg>-Dtmpjars=hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</arg>
>   <file>/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</file>
>   <capture-output />
> </java>
>
> Thanks,
> -Idris
>


Re: Dataset URI resolution with regex pattern

Posted by Idris Ali <ps...@gmail.com>.
Hi Mohammad,

This makes sense, but I was looking for an out-of-the-box Oozie solution:
some way to tell Oozie not to gate (wait) on the URI while still
generating the events, for example when there is no done flag (neither
_SUCCESS nor the directory nor any file).

One way is to specify it as an output event and pass it as a property to
Oozie; then there is no gating, but an output event can have only one
instance, unlike an input event with its start and end instances.

Thanks again,
-Idris

On Tue, Feb 14, 2012 at 2:44 PM, Mohammad Islam <mi...@yahoo.com> wrote:

> Hi Idris,
> One possible way is that you can write a Java action before your MR action
> in WF.xml.
> Coordinator will pass two properties: one for all the paths, another for
> appender.
> These two properties will be passed to the java action. Java action will
> parse the first and append the appender on each path. At last, using
> capture-output, java action will send back the modified path list. At last,
> use the java action passed value as the input path to MR action.
> If something is not clear or need more specific help, please feel free to
> let us know.
>
> Regards,
> Mohammad
>
>
> ________________________________
> From: Idris Ali <ps...@gmail.com>
> To: oozie-users@incubator.apache.org
> Sent: Monday, February 13, 2012 9:32 PM
> Subject: Dataset URI resolution with regex pattern
>
> Hi,
>
> I have a coordinator XML, where the Dataset "inPath" should wait on base
> directory:
>
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
>
> However while passing this to MR job I need to append a Regex Pattern:
>                 <property>
>                     <name>inPath</name>
>
> <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
>                 </property>
>
> But oozie seems to append this path only to the last resolved dataset event
> and not to all, how do I define the coordinator so that every resolved
> dataset event has the regex pattern? Any help is appreciated or else we
> might need to create custom EL function.
>
> Here is my coordinator.xml:
>
> <coordinator-app name="rm-coord" frequency="${coord:days(1)}"
> start="${start}" end="${end}" timezone="UTC"
>                  xmlns="uri:oozie:coordinator:0.2">
>     <controls>
>         <concurrency>1</concurrency>
>     </controls>
>
>     <datasets>
>         <dataset name="inPath" frequency="${coord:hours(1)}"
> initial-instance="2012-01-30T00:00Z" timezone="UTC">
>
>
> <uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
>         <done-flag></done-flag>
>         </dataset>
>         <dataset name="interPath" frequency="${coord:days(1)}"
> initial-instance="2012-01-30T01:00Z" timezone="UTC">
>
>
> <uri-template>/projects/bi/ioout/interactions/${YEAR}-${MONTH}-${DAY}</uri-template>
>     <done-flag></done-flag>
>         </dataset>
>         <dataset name="outputhdfsdir" frequency="${coord:days(1)}"
> initial-instance="2012-01-31T01:00Z" timezone="UTC">
>
> <uri-template>/projects/bi/rmc/daily/AdvInteractionSummary</uri-template>
>         <done-flag></done-flag>
>         </dataset>
>     </datasets>
>
>     <input-events>
>         <data-in name="inPath-event" dataset="inPath">
>             <start-instance>${coord:current(-26)}</start-instance>
>             <end-instance>${coord:current(0)}</end-instance>
>         </data-in>
>     </input-events>
>     <output-events>
>         <data-out name="interPath-event" dataset="interPath">
>             <instance>${coord:current(-1)}</instance>
>         </data-out>
>         <data-out name="outputhdfsdir-event" dataset="outputhdfsdir">
>             <instance>${coord:current(0)}</instance>
>         </data-out>
>     </output-events>
>
>     <action>
>         <workflow>
>             <app-path>${nameNode}/user/${coord:user()}/apps/rm</app-path>
>             <configuration>
>                 <property>
>                     <name>jobTracker</name>
>                     <value>${jobTracker}</value>
>                 </property>
>                 <property>
>                     <name>nameNode</name>
>                     <value>${nameNode}</value>
>                 </property>
>                 <property>
>                     <name>queueName</name>
>                     <value>${queueName}</value>
>                 </property>
>                 <property>
>                     <name>logTime</name>
>                     <value>${coord:formatTime(coord:actualTime(),
> 'yyyy-MM-dd')}</value>
>                 </property>
>                 <property>
>                     <name>inPath</name>
>
> <value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
>                 </property>
>                 <property>
>                     <name>interPath</name>
>                     <value>${coord:dataOut('interPath-event')}</value>
>                 </property>
>                 <property>
>                     <name>outputhdfsdir</name>
>                     <value>${coord:dataOut('outputhdfsdir-event')}</value>
>                 </property>
>             </configuration>
>         </workflow>
>     </action>
> </coordinator-app>
>
>
> And this is how the resolved Workflow looks like:
>
> <java xmlns="uri:oozie:workflow:0.2">
>   <job-tracker>shaik-idris:8021</job-tracker>
>   <name-node>hdfs://shaik-idris:8020</name-node>
>   <configuration>
>     <property>
>       <name>mapred.job.queue.name</name>
>       <value>default</value>
>     </property>
>   </configuration>
>
>
> <main-class>com.inmobi.grid.richmedia.aggregation.InteractionAggregation</main-class>
>   <arg>-Dlog.time=2012-02-12</arg>
>
> <arg>-Dlog.in.path=hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-01
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-00
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-23
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-22
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-21
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-20
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-19
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-18
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-17
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-16
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-15
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-14
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-13
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-12
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-11
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-10
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30
> -09,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-08
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-07
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-06
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-05
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-04
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-03
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-02
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-01
> ,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-00
> /clickon/*/*/OK/*/*</arg>
>   <arg>-Dout.path=/projects/bi/ioout/interactions/2012-01-30</arg>
>
>
> <arg>-Dtmpjars=hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</arg>
>   <file>/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar</file>
>   <file>/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</file>
>   <capture-output />
> </java>
>
> Thanks,
> -Idris
>

Re: Dataset URI resolution with regex pattern

Posted by Mohammad Islam <mi...@yahoo.com>.
Hi Idris,
One possible way is to add a Java action before your MR action in the workflow (WF.xml).
The coordinator will pass two properties: one containing all the resolved paths, and another containing the suffix to append.
Both properties will be passed to the Java action, which will parse the first and append the suffix to each path. Using capture-output, the Java action will then send back the modified path list. Finally, use the value returned by the Java action as the input path to the MR action.
If something is not clear or you need more specific help, please feel free to let us know.

Regards,
Mohammad


________________________________
From: Idris Ali <ps...@gmail.com>
To: oozie-users@incubator.apache.org 
Sent: Monday, February 13, 2012 9:32 PM
Subject: Dataset URI resolution with regex pattern

Hi,

I have a coordinator XML, where the Dataset "inPath" should wait on base
directory:
<uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>

However while passing this to MR job I need to append a Regex Pattern:
                <property>
                    <name>inPath</name>

<value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
                </property>

But Oozie seems to append this path only to the last resolved dataset event
and not to all of them. How do I define the coordinator so that every
resolved dataset event has the regex pattern appended? Any help is
appreciated; otherwise we might need to create a custom EL function.

Here is my coordinator.xml:

<coordinator-app name="rm-coord" frequency="${coord:days(1)}"
start="${start}" end="${end}" timezone="UTC"
                 xmlns="uri:oozie:coordinator:0.2">
    <controls>
        <concurrency>1</concurrency>
    </controls>

    <datasets>
        <dataset name="inPath" frequency="${coord:hours(1)}"
initial-instance="2012-01-30T00:00Z" timezone="UTC">

<uri-template>${nameNode}/projects/bi/ioout/${YEAR}-${MONTH}-${DAY}-${HOUR}</uri-template>
        <done-flag></done-flag>
        </dataset>
        <dataset name="interPath" frequency="${coord:days(1)}"
initial-instance="2012-01-30T01:00Z" timezone="UTC">

<uri-template>/projects/bi/ioout/interactions/${YEAR}-${MONTH}-${DAY}</uri-template>
    <done-flag></done-flag>
        </dataset>
        <dataset name="outputhdfsdir" frequency="${coord:days(1)}"
initial-instance="2012-01-31T01:00Z" timezone="UTC">

<uri-template>/projects/bi/rmc/daily/AdvInteractionSummary</uri-template>
        <done-flag></done-flag>
        </dataset>
    </datasets>

    <input-events>
        <data-in name="inPath-event" dataset="inPath">
            <start-instance>${coord:current(-26)}</start-instance>
            <end-instance>${coord:current(0)}</end-instance>
        </data-in>
    </input-events>
    <output-events>
        <data-out name="interPath-event" dataset="interPath">
            <instance>${coord:current(-1)}</instance>
        </data-out>
        <data-out name="outputhdfsdir-event" dataset="outputhdfsdir">
            <instance>${coord:current(0)}</instance>
        </data-out>
    </output-events>

    <action>
        <workflow>
            <app-path>${nameNode}/user/${coord:user()}/apps/rm</app-path>
            <configuration>
                <property>
                    <name>jobTracker</name>
                    <value>${jobTracker}</value>
                </property>
                <property>
                    <name>nameNode</name>
                    <value>${nameNode}</value>
                </property>
                <property>
                    <name>queueName</name>
                    <value>${queueName}</value>
                </property>
                <property>
                    <name>logTime</name>
                    <value>${coord:formatTime(coord:actualTime(),
'yyyy-MM-dd')}</value>
                </property>
                <property>
                    <name>inPath</name>

<value>${coord:dataIn('inPath-event')}/clickon/*/*/OK/*/*</value>
                </property>
                <property>
                    <name>interPath</name>
                    <value>${coord:dataOut('interPath-event')}</value>
                </property>
                <property>
                    <name>outputhdfsdir</name>
                    <value>${coord:dataOut('outputhdfsdir-event')}</value>
                </property>
            </configuration>
        </workflow>
    </action>
</coordinator-app>


And this is what the resolved workflow looks like:

<java xmlns="uri:oozie:workflow:0.2">
  <job-tracker>shaik-idris:8021</job-tracker>
  <name-node>hdfs://shaik-idris:8020</name-node>
  <configuration>
    <property>
      <name>mapred.job.queue.name</name>
      <value>default</value>
    </property>
  </configuration>

<main-class>com.inmobi.grid.richmedia.aggregation.InteractionAggregation</main-class>
  <arg>-Dlog.time=2012-02-12</arg>

<arg>-Dlog.in.path=hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-01,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-31-00,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-23,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-22,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-21,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-20,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-19,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-18,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-17,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-16,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-15,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-14,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-13,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-12,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-11,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-10,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-09,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-08,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-07,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-06,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-05,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-04,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-03,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-02,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-01,hdfs://shaik-idris:8020/projects/bi/ioout/2012-01-30-00/clickon/*/*/OK/*/*</arg>
  <arg>-Dout.path=/projects/bi/ioout/interactions/2012-01-30</arg>

<arg>-Dtmpjars=hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar,hdfs://shaik-idris:8020/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</arg>
  <file>/user/shaik.idris/apps/rm/lib/rm-reports-0.1.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/io-lib-0.3-SNAPSHOT.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/libthrift-0.5.0-cdh.jar</file>
  <file>/user/shaik.idris/apps/rm/lib/hive-exec-0.7.0-cdh3u0.jar</file>
  <capture-output />
</java>

Thanks,
-Idris