You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2012/10/15 18:22:24 UTC
svn commit: r1398363 [3/3] - in
/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook: ./
images/tools/tm/ images/tools/tm/workbench/
images/tools/tm/workbench/explain/ images/tools/tm/workbench/overview/
images/tools/tm/workbench/projects...
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.projects.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.projects.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.projects.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.projects.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,201 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+
+
+<section id="section.ugr.tools.tm.workbench.projects">
+ <title>TextMarker Projects</title>
+ <para>
+ TextMarker projects used within the TextMarker workbench need to have
+ a certain folder
+ structure. The parts of this folder structure are
+ explained in
+ <xref linkend='table.ugr.tools.tm.workbench.create_project.folder_strucutre' />
+ . To create a TextMarker project it is recommended to use the provided
+ wizard, explained in
+ <xref linkend='section.ugr.tools.tm.workbench.projects.create_projects' />
+ . If this wizard is used, the required folder structure is
+ automatically created.
+ </para>
+
+ <para>
+ <table id="table.ugr.tools.tm.workbench.create_project.folder_strucutre"
+ frame="all">
+ <title>Project folder structure</title>
+ <tgroup cols="2" colsep="1" rowsep="1">
+ <colspec colname="c1" colwidth="1*" />
+ <colspec colname="c2" colwidth="4*" />
+ <thead>
+ <row>
+ <entry align="center">Folder</entry>
+ <entry align="center">Description</entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry>script</entry>
+ <entry>
+ Source folder for TextMarker scripts and packages.
+ </entry>
+ </row>
+ <row>
+ <entry>descriptor</entry>
+ <entry>
+ Build folder for UIMA components. Analysis engines and type
+ systems
+ are created automatically from the related script files.
+ </entry>
+ </row>
+ <row>
+ <entry>input</entry>
+ <entry>
+ Folder that contains the files that will be processed when
+ launching a
+ TextMarker script. Such input files could be plain
+ text,
+ HTML, xmiCAS files or others.
+ </entry>
+ </row>
+ <row>
+ <entry>output</entry>
+ <entry>
+ Folder that contains the resulting xmiCAS files. One xmiCAS
+ file is generated for each associated document in the input
+ folder.
+ </entry>
+ </row>
+ <row>
+ <entry>resources</entry>
+ <entry>
+ Default folder for word lists, dictionaries and tables.
+ </entry>
+ </row>
+ <row>
+ <entry>test</entry>
+ <entry>
+ Folder for test-driven development.
+ </entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ </para>
+ <para>
+ <xref linkend='figure.ugr.tools.tm.workbench.projects.test_project' />
+ shows a project, newly created with the wizard.
+
+ <figure id="figure.ugr.tools.tm.workbench.projects.test_project">
+ <title>A newly created TextMarker project</title>
+ <mediaobject>
+ <imageobject role="html">
+ <imagedata width="300px" format="PNG" align="center"
+ fileref="&imgroot;projects/test_project.PNG" />
+ </imageobject>
+ <imageobject role="fo">
+ <imagedata width="3.5in" format="PNG" align="center"
+ fileref="&imgroot;projects/test_project.PNG" />
+ </imageobject>
+ <textobject>
+ <phrase>
+ A newly created TextMarker project.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+
+ <section id="section.ugr.tools.tm.workbench.projects.create_projects">
+ <title>TextMarker create project wizard</title>
+ <para>
+ To create a new TextMarker project, switch to the TextMarker perspective
+ and click
+ <quote>File → New → TextMarker Project</quote>
+ . This opens the corresponding wizard.
+ </para>
+
+ <para>
+ <xref
+ linkend='figure.ugr.tools.tm.workbench.projects.create_projects.wizard1' />
+ shows the start page of the wizard.
+ <figure
+ id="figure.ugr.tools.tm.workbench.projects.create_projects.wizard1">
+ <title>Wizard start page</title>
+ <mediaobject>
+ <imageobject role="html">
+ <imagedata width="450px" format="PNG" align="center"
+ fileref="&imgroot;projects/wizard1.PNG" />
+ </imageobject>
+ <imageobject role="fo">
+ <imagedata width="4.5in" format="PNG" align="center"
+ fileref="&imgroot;projects/wizard1.PNG" />
+ </imageobject>
+ <textobject>
+ <phrase>
+ Wizard start page.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+ <para>
+ To create a simple TextMarker project just enter a project name for
+ your project and click
+ <quote>Finish</quote>
+ . This will create all you need to start.
+ </para>
+ <para>
+ Other possible settings on this page are the desired location of
+ the project,
+ the interpreter to use and the working set you wish to
+ work on, all of which are self-explanatory.
+ </para>
+ <para>
+ On the second page of the wizard you can mainly configure the
+ needed build path. This is necessary if you like to use external
+ source
+ folders or if the project to create will be dependent on other
+ projects or if external libraries have to be found. Add the desired
+ configuration in the related tab.
+ </para>
+ <para>
+ <xref
+ linkend='figure.ugr.tools.tm.workbench.projects.create_projects.wizard2' />
+ shows the second page of the wizard.
+ <figure
+ id="figure.ugr.tools.tm.workbench.projects.create_projects.wizard2">
+ <title>Wizard second page</title>
+ <mediaobject>
+ <imageobject role="html">
+ <imagedata width="450px" format="PNG" align="center"
+ fileref="&imgroot;projects/wizard2.PNG" />
+ </imageobject>
+ <imageobject role="fo">
+ <imagedata width="4.5in" format="PNG" align="center"
+ fileref="&imgroot;projects/wizard2.PNG" />
+ </imageobject>
+ <textobject>
+ <phrase>
+ Wizard second page.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+
+ </section>
+
+</section>
\ No newline at end of file
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.query.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.query.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.query.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.query.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,148 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+
+<section id="section.ugr.tools.tm.workbench.tm_query">
+ <title>Query View</title>
+ <para>
+ With the Query View the TextMarker language can be used to write
+ queries on a set of
+ documents. A query is simply a set of TextMarker
+ rules. Each query returns a list of all text
+ passages the query
+ applies to. For example, if you have a set of annotated documents
+ containing a
+ number of Author annotations, you could use the Query
+ View to get a list of all the author names
+ associated with these
+ annotations.
+ </para>
+ <para>
+ <figure id="figure.ugr.tools.tm.workbench.tm_query.query_view">
+ <title>
+ The Query View.
+ <emphasis role="bold">(1)</emphasis>
+ Start Button;
+ <emphasis role="bold">(2)</emphasis>
+ Export Button
+ </title>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG"
+ fileref="&imgroot;query/query_numbers_tm2.png" />
+ </imageobject>
+ <textobject>
+ <phrase>Query View</phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+ <para>
+ Use the Query view as follows:
+ <orderedlist numeration="arabic">
+ <listitem>
+ <para>
+ The field
+ <quote>Query Data</quote>
+ specifies the folder containing the
+ documents on which the
+ query should be executed. You can
+ either
+ click on the button next to the field to specify
+ the folder by
+ browsing through the
+ file system, or you can drag and drop a
+ folder directly
+ into the field. If the checkbox is
+ activated, all
+ subfolders are included.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The field
+ <quote>Type System</quote>
+ has to contain a type system or a
+ TextMarker script that
+ specifies all types that are used
+ in
+ the
+ query. You can either click on the button next to
+ the field to
+ specify the type system
+ by browsing through the file system, or
+ you can drag
+ and drop a type system directly into
+ the field.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ The query in form of one or more TextMarker rules is
+ specified in
+ the
+ text field in the
+ middle of the view. The
+ screenshot shows ...
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ After pressing the start button the query is started. The
+ results are subsequently
+ displayed in the bottom text field.
+ </para>
+ </listitem>
+ </orderedlist>
+ </para>
+ <para>
+ The resulting list consists of all text passages the query
+ applied to. Above the text
+ field,
+ information about the entire number of
+ matches and the
+ number of different documents
+ the query
+ applied to is given. Each item in
+ the list
+ shows both the matched text passage and
+ in brackets the
+ document related to the text
+ passage. By double-clicking on one
+ of
+ the listed
+ items, the related
+ document is opened in the editor and the matched text passage is
+ selected. If the related
+ document is already open you can jump to another matched text
+ passage within the same
+ document with just one click on the listed item. Of course
+ this text passage is then selected. By
+ clicking on the export button a list of
+ all matched
+ text passages
+ is shown in a
+ separate window.
+ For further usage, e.g. as a list
+ of authors in
+ another TextMarker
+ project, copy the content of
+ this
+ window to another text
+ file.
+ </para>
+</section>
\ No newline at end of file
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.testing.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.testing.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.testing.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.testing.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,284 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.
+ See the NOTICE file distributed with this work for additional information regarding copyright ownership.
+ The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not
+ use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software distributed under the License is
+ distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and limitations under the License. -->
+
+<section id="ugr.tools.tm.testing">
+ <title>Testing</title>
+ <para> The TextMarker Software comes bundled with its own testing environment that allows you
+ to test and evaluate TextMarker scripts. It provides full back end testing capabilities and
+ allows you to examine test results in detail. As a product of the testing operation a new
+ document file will be created and detailed information on how well the script performed in the
+ test will be added to this document.
+ </para>
+ <section id="ugr.tools.tm.testing.overview">
+ <title>Overview</title>
+ <para>
+ The testing procedure compares a previously annotated gold standard file with the result of
+ the selected TextMarker script using an evaluator. The evaluators compare the offsets of
+ annotations in both documents and, depending on the evaluator, mark a result document with
+ true positive, false positive or false negative annotations. Afterwards the f1-score is
+ calculated for the whole set of tests, each test file and each type in the test file. The
+ testing environment contains the following parts :
+ <itemizedlist>
+ <listitem>
+ <para>Main view</para>
+ </listitem>
+ <listitem>
+ <para>Result views : true positive, false positive, false negative view
+ </para>
+ </listitem>
+ <listitem>
+ <para>Preference page</para>
+ </listitem>
+ </itemizedlist>
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG" fileref="&imgroot;Screenshot_main.png" />
+ </imageobject>
+ <textobject>
+ <phrase>Eclipse with open TextMarker and testing environment.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ All control elements that are needed for the interaction with the testing environment are
+ located in the main view. This is also where test files can be selected and information on
+ how well the script performed is displayed. During the testing process a result CAS file is
+ produced that will contain new annotation types like true positives (tp), false positives
+ (fp) and false negatives (fn). While displaying the result .xmi file in the script editor,
+ additional views allow easy navigation through the new annotations. Additional tree views,
+ like the true positive view, display the corresponding annotations in a hierarchic
+ structure. This allows an easy tracing of the results inside the testing document. A
+ preference page allows customization of the behavior of the testing plug-in.
+ </para>
+ <section id="ugr.tools.tm.testing.overview.main">
+ <title>Main View</title>
+ <para>
+ The following picture shows a close-up view of the testing environment's main view.
+ The toolbar contains all buttons needed to operate the plug-ins. The first line shows the
+ name of the script that is going to be tested and a combo-box where the view that should
+ be tested is selected. On the right follow fields that will show some basic information
+ about the results of the test-run. Below and on the left the test-list is located. This list
+ contains the different test-files. Right beside it, you will find a table with statistical
+ information. It shows the total tp, fp and fn counts, as well as precision, recall and
+ f1-score of every test-file and of every type in each file.
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG" fileref="&imgroot;Screenshot_testing_desc_3_resize.png" />
+ </imageobject>
+ <textobject>
+ <phrase>The main view of the testing environment.</phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ </para>
+ </section>
+ <section id="ugr.tools.tm.testing.overview.result">
+ <title>Result Views</title>
+ <para>
+ These views add additional information to the CAS View, once a result file is opened. Each
+ view displays one of the following annotation types in a hierarchic tree structure: true
+ positives, false positives and false negatives. Adding a check mark to one of the
+ annotations in a result view will highlight the annotation in the CAS Editor.
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG" fileref="&imgroot;Screenshot_result.png" />
+ </imageobject>
+ <textobject>
+ <phrase>The result views of the testing environment.</phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ </para>
+ </section>
+ <section id="ugr.tools.tm.testing.overview.preferences">
+ <title>Preference Page</title>
+ <para>
+ The preference page offers a few options that will modify the plug-in's general behavior.
+ For example the preloading of previously collected result data can be turned off, should
+ it produce a too long loading time. An important option in the preference page is the
+ evaluator you can select. By default the "exact evaluator" is selected, which compares the
+ offsets of the annotations, that are contained in the file produced by the selected
+ script, with the annotations in the test file. Other evaluators will compare annotations
+ in a different way.
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG" fileref="&imgroot;Screenshot_preferences.png" />
+ </imageobject>
+ <textobject>
+ <phrase>The preference page of the testing environment.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ </para>
+ </section>
+ <section id="ugr.tools.tm.testing.overview.project">
+ <title>The TextMarker Project Structure</title>
+ <para>
+ The picture shows the TextMarker script explorer. Every TextMarker project contains a
+ folder called "test". This folder is the default location for the test-files. In the
+ folder each script-file has its own sub-folder with a relative path equal to the script's
+ package path in the "script" folder. This folder contains the test files. In every script's
+ test-folder you will also find a result folder with the results of the tests. Should you
+ use test-files from another location in the file-system, the results will be saved in the
+ "temp" sub-folder of the project's "test" folder. All files in the "temp" folder will be
+ deleted once Eclipse is closed.
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG" fileref="&imgroot;folder_struc_sep_desc_cut.png" />
+ </imageobject>
+ <textobject>
+ <phrase>Script Explorer with the test folder expanded.</phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ </para>
+ </section>
+ </section>
+
+ <section id="ugr.tools.tm.testing.usage">
+ <title>Usage</title>
+ <para> This section will demonstrate how to use the testing environment. It will show the
+ basic actions needed to perform a test run.
+ </para>
+ <para> Preparing Eclipse: The testing environment provides its own perspective called
+ "TextMarker Testing". It will display the main view as well as the different result views on
+ the right hand side. It is encouraged to use this perspective, especially when working with
+ the testing environment for the first time.
+ </para>
+ <para> Selecting a script for testing: TextMarker will always test the script, that is
+ currently open in the script-editor. Should another editor be open, for example a
+ java-editor with some java class being displayed, you will see that the testing view is not
+ available.
+ </para>
+ <para> Creating a test file: A test-file is a previously annotated .xmi file that can be used
+ as a gold standard for the test. The testing environment provides no additional tools for
+ creating such a file; instead, the tools already provided by the TextMarker system can be used.
+ </para>
+ <para> Selecting a test-file: Test files can be added to the test-list by simply dragging them
+ from the Script Explorer into the test-file list. Depending on the setting in the preference
+ page, test-files from a script's "test" folder might already be loaded into the list. A
+ different way to add test-files is to use the "Add files from folder" button. It can be used
+ to add all .xmi files from a selected folder. The "del" key can be used to remove files from
+ the test-list.
+ </para>
+ <para> Selecting a CAS View to test: TextMarker supports different views that allow you to
+ operate on different levels in a document. The InitialView is selected as default; however,
+ you can also switch the evaluation to another view by typing the view's name into the list or
+ selecting the view you wish to use from the list.
+ </para>
+ <para> Selecting the evaluator: The testing environment supports different evaluators that
+ allow a sophisticated analysis of the behavior of a TextMarker script. The evaluator can be
+ chosen in the testing environment's preference page. The preference page can be opened either
+ through the menu or by clicking the blue preference buttons in the testing view's toolbar. The
+ default evaluator is the "Exact CAS Evaluator" which compares the offsets of the annotations
+ between the test file and the file annotated by the tested script.
+ </para>
+ <para> Excluding Types: During a test-run it might be convenient to disable testing for
+ specific types like punctuation or tags. The "exclude types" button will open a dialog
+ where all types can be selected that should not be considered in the test.
+ </para>
+ <para> Running the test: A test-run can be started by clicking on the green start button in
+ the toolbar.
+ </para>
+ <para> Result Overview: The testing main view displays some information on how well the
+ script did after every test run. It will display an overall number of true positive, false
+ positive and false negative annotations of all result files as well as an overall f1-score.
+ Furthermore a table will be displayed that contains the overall statistics of the selected
+ test file as well as statistics for every single type in the test file. The information
+ displayed are true positives, false positives, false negatives, precision, recall and
+ f1-measure.
+ </para>
+ <para> The testing environment also supports the export of the overall data in form of a
+ comma-separated table. Clicking the "export evaluation data" button will open a dialog window that
+ contains this table. The text in this table can be copied and easily imported into
+ OpenOffice.org or MS Excel.
+ </para>
+ <para>
+ Result Files: When running a test, the evaluator will create a new result .xmi file and will
+ add new true positive, false positive and false negative annotations. By clicking on a file
+ in the test-file list, you can open the corresponding result .xmi file in the TextMarker
+ script editor. When opening a result file in the script explorer, additional views will
+ open, that allow easy access and browsing of the additional debugging annotations.
+ <screenshot>
+ <mediaobject>
+ <imageobject>
+ <imagedata scale="80" format="PNG"
+ fileref="&imgroot;Screenshot_Result_TP_desc_close_cut.png" />
+ </imageobject>
+ <textobject>
+ <phrase>Open result file and selected true positive annotation in the true positive
+ view.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </screenshot>
+ </para>
+ </section>
+ <section id="ugr.tools.tm.testing.evaluators">
+ <title>Evaluators</title>
+ <para> When testing a CAS file, the system compares the offsets of the annotations of a
+ previously annotated gold standard file with the offsets of the annotations of the result
+ file the script produced. Responsible for comparing annotations in the two CAS files are
+ evaluators. These evaluators implement different methods and strategies for comparing the
+ annotations. Also, an extension point is provided that allows easy implementation of
+ new evaluators.
+ </para>
+ <para> Exact Match Evaluator: The Exact Match Evaluator compares the offsets of the
+ annotations in the result and the gold standard file. Any difference will be marked with
+ either a false positive or a false negative annotation.
+ </para>
+ <para> Partial Match Evaluator: The Partial Match Evaluator compares the offsets of the
+ annotations in the result and gold standard file. It will allow differences in the
+ beginning or the end of an annotation. For example "corresponding" and "corresponding " will
+ not be annotated as an error.
+ </para>
+ <para> Core Match Evaluator: The Core Match Evaluator accepts annotations that share a core
+ expression. In this context a core expression is at least four digits long and starts with a
+ capitalized letter. For example the two annotations "L404-123-421" and "L404-321-412" would
+ be considered a true positive match, because "L404" is considered a core expression that
+ is contained in both annotations.
+ </para>
+ <para> Word Accuracy Evaluator: Compares the labels of all words/numbers in an annotation,
+ whereas the label equals the type of the annotation. This has the consequence, for example,
+ that each word or number that is not part of the annotation is counted as a single false
+ negative. For example we have the sentence: "Christmas is on the 24.12 every year." The
+ script labels "Christmas is on the 12" as a single sentence, while the test file labels the
+ sentence correctly with a single sentence annotation. While, for example, the Exact CAS
+ Evaluator will only assign a single False Negative annotation, the Word Accuracy Evaluator will
+ mark every word or number as a single False Negative.
+ </para>
+ <para> Template Only Evaluator: This Evaluator compares the offsets of the annotations and the
+ features, that have been created by the script. For example the text "Alan Mathison Turing"
+ is marked with the author annotation and "author" contains 2 features: "FirstName" and
+ "LastName". If the script now creates an author annotation with only one feature, the
+ annotation will be marked as a false positive.
+ </para>
+ <para> Template on Word Level Evaluator: The Template on Word Level Evaluator compares the offsets
+ of the annotations. In addition it also compares the features and feature structures and the
+ values stored in the features. For example the annotation "author" might have features like
+ "FirstName" and "LastName". The author's name is "Alan Mathison Turing" and the script
+ correctly assigns the author annotation. The features assigned by the script are "FirstName :
+ Alan", "LastName : Mathison", while the correct feature values would be "FirstName : Alan",
+ "LastName : Turing". In this case the Template on Word Level Evaluator will mark the annotation as a
+ false positive, since the feature values differ.
+ </para>
+ </section>
+</section>
\ No newline at end of file
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.textruler.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.textruler.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.textruler.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.textruler.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,152 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.
+ See the NOTICE file distributed with this work for additional information regarding copyright ownership.
+ The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not
+ use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software distributed under the License is
+ distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and limitations under the License. -->
+
+<section id="section.ugr.tools.tm.workbench.textruler">
+ <title>TextRuler</title>
+ <para> Using the knowledge engineering approach, a knowledge engineer normally writes
+ handcrafted rules to create a domain dependent information extraction application, often
+ supported by a gold standard. When starting the engineering process for the acquisition of the
+ extraction knowledge for a possibly new slot or, more generally, for new concepts, machine learning
+ methods are often able to offer support in an iterative engineering process. This section
+ gives a conceptual overview of the process model for the semi-automatic development of
+ rule-based information extraction applications.
+ </para>
+ <para> First, a suitable set of documents that contain the text fragments with interesting
+ patterns needs to be selected and annotated with the target concepts. Then, the knowledge
+ engineer chooses and configures the methods for automatic rule acquisition to the best of his
+ knowledge for the learning task: Lambda expressions based on tokens and linguistic features,
+ for example, differ in their application domain from wrappers that process generated HTML
+ pages.
+ </para>
+ <para> Furthermore, parameters like the window size defining relevant features need to be set to
+ an appropriate level. Before the annotated training documents form the input of the learning
+ task, they are enriched with features generated by the partial rule set of the developed
+ application. The result of the methods, that is the learned rules, are proposed to the
+ knowledge engineer for the extraction of the target concept.
+ </para>
+ <para> The knowledge engineer has different options to proceed: If the quality, amount or
+ generality of the presented rules is not sufficient, then additional training documents need
+ to be annotated or additional rules have to be handcrafted to provide more features in general
+ or more appropriate features. Rules or rule sets of high quality can be modified, combined or
+ generalized and transferred to the rule set of the application in order to support the
+ extraction task of the target concept. In the case that the methods did not learn reasonable
+ rules at all, the knowledge engineer proceeds with writing handcrafted rules.
+ </para>
+ <para> Having gathered enough extraction knowledge for the current concept, the semi-automatic
+ process is iterated and the focus is moved to the next concept until the development of the
+ application is completed.
+ </para>
+ <section id="ugr.tools.tm.textruler.learner">
+ <title>Available Learners</title>
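+ <para> Overview
+ <informaltable>
+ <tgroup cols="5">
+ <thead>
+ <row><entry>Name</entry><entry>Strategy</entry><entry>Document</entry><entry>Slots</entry><entry>Status</entry></row>
+ </thead>
+ <tbody>
+ <row><entry>BWI (1)</entry><entry>Boosting, Top Down</entry><entry>Struct, Semi</entry><entry>Single, Boundary</entry><entry>Planning</entry></row>
+ <row><entry>LP2 (2)</entry><entry>Bottom Up Cover</entry><entry>All</entry><entry>Single, Boundary</entry><entry>Prototype</entry></row>
+ <row><entry>RAPIER (3)</entry><entry>Top Down/Bottom Up Compr.</entry><entry>Semi</entry><entry>Single</entry><entry>Experimental</entry></row>
+ <row><entry>WHISK (4)</entry><entry>Top Down Cover</entry><entry>All</entry><entry>Multi</entry><entry>Prototype</entry></row>
+ <row><entry>WIEN (5)</entry><entry>CSP</entry><entry>Struct</entry><entry>Multi, Rows</entry><entry>Prototype</entry></row>
+ </tbody>
+ </tgroup>
+ </informaltable>
+ </para>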
+ <para>
+ <itemizedlist>
+ <listitem><para>Strategy: The strategies used by the learning methods are commonly coverage
+ algorithms.</para></listitem>
+ <listitem><para>Document: The type of the document may be <emphasis>free</emphasis> like in
+ newspapers, <emphasis>semi</emphasis> or <emphasis>struct</emphasis> like HTML pages.</para></listitem>
+ <listitem><para>Slots: The slots refer to a single annotation that represents the goal of the
+ learning task. Some rules are able to create several annotations at once in the same context
+ (multi-slot). However, only single slots are supported by the current
+ implementations.</para></listitem>
+ <listitem><para>Status: The current status of the implementation in the TextRuler
+ framework.</para></listitem>
+ </itemizedlist>
+ </para>
+ <para> Publications
+ </para>
+ <para> (1) Dayne Freitag and Nicholas Kushmerick. Boosted Wrapper Induction. In AAAI/IAAI,
+ pages 577–583, 2000.
+ </para>
+ <para> (2) F. Ciravegna. (LP)2, Rule Induction for Information Extraction Using Linguistic
+ Constraints. Technical Report CS-03-07, Department of Computer Science, University of
+ Sheffield, Sheffield, 2003.
+ </para>
+ <para> (3) Mary Elaine Califf and Raymond J. Mooney. Bottom-up Relational Learning of Pattern
+ Matching Rules for Information Extraction. Journal of Machine Learning Research, 4:177–210,
+ 2003.
+ </para>
+ <para> (4) Stephen Soderland, Claire Cardie, and Raymond Mooney. Learning Information
+ Extraction Rules for Semi-Structured and Free Text. In Machine Learning, volume 34, pages
+ 233–272, 1999.
+ </para>
+ <para> (5) N. Kushmerick, D. Weld, and B. Doorenbos. Wrapper Induction for Information
+ Extraction. In Proc. International Joint Conference on Artificial Intelligence (IJCAI), 1997.
+ </para>
+ <para> BWI BWI (Boosted Wrapper Induction) uses boosting techniques to improve the performance
+ of simple pattern matching single-slot boundary wrappers (boundary detectors). Two sets of
+ detectors are learned: the "fore" and the "aft" detectors. Weighted by their confidences and
+ combined with a slot length histogram derived from the training data they can classify a
+ given pair of boundaries within a document. BWI can be used for structured, semi-structured
+ and free text. The patterns are token-based with special wildcards for more general rules.
+ </para>
+ <para> Implementations No implementations are yet available.
+ </para>
+ <para> Parameters No parameters are yet available.
+ </para>
+ <para> LP2 This method operates on all three kinds of documents. It learns separate rules for
+ the beginning and the end of a single slot. So called tagging rules insert boundary SGML
+ tags and additionally induced correction rules shift misplaced tags to their correct
+ positions in order to improve precision. The learning strategy is a bottom-up covering
+ algorithm. It starts by creating a specific seed instance with a window of w tokens to the
+ left and right of the target boundary and searches for the best generalization. Other
+ linguistic NLP-features can be used in order to generalize over the flat word sequence.
+ </para>
+ <para> Implementations LP2 (naive): LP2 (optimized):
+ </para>
+ <para> Parameters Context Window Size (to the left and right): Best Rules List Size: Minimum
+ Covered Positives per Rule: Maximum Error Threshold: Contextual Rules List Size:
+ </para>
+ <para> RAPIER RAPIER induces single slot extraction rules for semi-structured documents. The
+ rules consist of three patterns: a pre-filler, a filler and a post-filler pattern. Each can
+ hold several constraints on tokens and their corresponding POS tag and semantic information.
+ The algorithm uses a bottom-up compression strategy, starting with a most specific seed rule
+ for each training instance. This initial rule base is compressed by randomly selecting rule
+ pairs and searching for the best generalization. Considering two rules, the least general
+ generalization (LGG) of the slot fillers is created and specialized by adding rule items to
+ the pre- and post-filler until the new rules operate well on the training set. The best of
+ the k rules (k-beam search) is added to the rule base and all empirically subsumed rules are
+ removed.
+ </para>
+ <para> Implementations RAPIER:
+ </para>
+ <para> Parameters Maximum Compression Fail Count: Internal Rules List Size: Rule Pairs for
+ Generalizing: Maximum 'No improvement' Count: Maximum Noise Threshold: Minimum Covered
+ Positives Per Rule: PosTag Root Type: Use All 3 GenSets at Specialization:
+ </para>
+ <para> WHISK WHISK is a multi-slot method that operates on all three kinds of documents and
+ learns single- or multi-slot rules looking similar to regular expressions. The top-down
+ covering algorithm begins with the most general rule and specializes it by adding single
+ rule terms until the rule makes no errors on the training set. Domain specific classes or
+ linguistic information obtained by a syntactic analyzer can be used as additional features.
+ The exact definition of a rule term (e.g. a token) and of a problem instance (e.g. a whole
+ document or a single sentence) depends on the operating domain and document type.
+ </para>
+ <para> Implementations WHISK (token): WHISK (generic):
+ </para>
+ <para> Parameters Window Size: Maximum Error Threshold: PosTag Root Type:
+ </para>
+ <para> WIEN WIEN is the only method listed here that operates on highly structured texts only.
+ It induces so-called wrappers that anchor the slots by the structured context around them.
+ The HLRT (head left right tail) wrapper class for example can determine and extract several
+ multi-slot-templates by first separating the important information block from unimportant
+ head and tail portions and then extracting multiple data rows from table like data
+ structures from the remaining document. Inducing a wrapper is done by solving a CSP for all
+ possible pattern combinations from the training data.
+ </para>
+ <para> Implementations WIEN:
+ </para>
+ <para> Parameters No parameters are available.
+ </para>
+ </section>
+</section>
\ No newline at end of file
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_documentation.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_documentation.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_documentation.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_documentation.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.
+ See the NOTICE file distributed with this work for additional information regarding copyright ownership.
+ The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not
+ use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software distributed under the License is
+ distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and limitations under the License. -->
+
+<section id="section.ugr.tools.tm.workbench.tm_documentation">
+ <title>TextMarker Documentation</title>
+ <para>
+
+ </para>
+
+</section>
\ No newline at end of file
Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_perspective.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_perspective.xml?rev=1398363&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_perspective.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/workbench/tools.textmarker.workbench.tm_perspective.xml Mon Oct 15 16:22:23 2012
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tm/workbench/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+
+<section id="section.ugr.tools.tm.workbench.tm_perspective">
+ <title>TextMarker Perspective</title>
+ <para>
+ The TextMarker perspective is the main view to manage TextMarker
+ projects. There are several views associated with the TextMarker
+ perspective: Annotation Test, Annotation Browser, Selection,
+ TextRuler, TextMarker Query. Since Annotation Test, TextRuler and
+ TextMarker Query have a stand-alone functionality they are explained
+ in separate sections.
+ </para>
+
+ <para>
+ To make it possible to reproduce all of the examples used below,
+ switch to the TextMarker Explain perspective within your Eclipse
+ workbench.
+ Import the TextMarker example project and open the main
+ TextMarker script file 'Main.tm'. Now press the 'Run' button (green
+ arrow) and wait for the end of execution. Open the resulting xmiCAS
+ file
+ 'Test1.txt.xmi', which you can find in the output folder.
+ </para>
+
+ <section
+ id="section.ugr.tools.tm.workbench.tm_perspective.annotation_browser">
+ <title>Annotation Browser</title>
+ <para>
+ The Annotation Browser can be used to view the annotations
+ created by the execution of a TextMarker project. If an xmiCAS file
+ is opened and active in the editor, the related annotations are shown
+ in this view.
+ </para>
+ <para>
+ The result of the execution of the TextMarker example project is
+ shown in
+ <xref
+ linkend='figure.ugr.tools.tm.workbench.tm_perspective.annotation_browser' />
+ .
+ </para>
+ <para>
+ <figure
+ id="figure.ugr.tools.tm.workbench.tm_perspective.annotation_browser">
+ <title> Annotation Browser view
+ </title>
+ <mediaobject>
+ <imageobject role="html">
+ <imagedata width="300px" format="PNG" align="center"
+ fileref="&imgroot;tm/annotation_browser.png" />
+ </imageobject>
+ <imageobject role="fo">
+ <imagedata width="3.0in" format="PNG" align="center"
+ fileref="&imgroot;tm/annotation_browser.png" />
+ </imageobject>
+ <textobject>
+ <phrase>
+ Annotation Browser view.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+ <para>
+ Moreover, this view has two possible filters. Using the
+ <quote>Only types with...</quote>
+ -filter leads to a list containing only those types that contain
+ the
+ entered text. The
+ <quote>Only annotations with...</quote>
+ -filter leads to an analogous list.
+ </para>
+ </section>
+
+ <section id="section.ugr.tools.tm.workbench.tm_perspective.selection">
+ <title>Selection</title>
+ <para>
+ The Selection view is very similar to the Annotation Browser
+ view, but only shows annotations that affect a specific text passage.
+ To get such a list, click on any position in the opened xmiCAS
+ document or select a certain text passage.
+ </para>
+ <para>
+ E.g., if you select the text passage
+ <literal>2008</literal>
+ , the Selection view will be generated as shown in
+ <xref
+ linkend='figure.ugr.tools.tm.workbench.tm_perspective.selection' />
+ .
+ </para>
+ <para>
+ <figure id="figure.ugr.tools.tm.workbench.tm_perspective.selection">
+ <title> Selection view
+ </title>
+ <mediaobject>
+ <imageobject role="html">
+ <imagedata width="560px" format="PNG" align="center"
+ fileref="&imgroot;tm/selection.png" />
+ </imageobject>
+ <imageobject role="fo">
+ <imagedata width="5.5in" format="PNG" align="center"
+ fileref="&imgroot;tm/selection.png" />
+ </imageobject>
+ <textobject>
+ <phrase>
+ Selection view.
+ </phrase>
+ </textobject>
+ </mediaobject>
+ </figure>
+ </para>
+ <para>
+ The Selection view has the same filtering possibilities as
+ described in Annotation Browser view.
+ </para>
+ </section>
+
+</section>
\ No newline at end of file