You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Rick Taylor <rx...@gmail.com> on 2012/03/21 03:40:41 UTC

Using RegexTransformer to find underlined text

I am trying to extract subject headings from the text of a PDF document
using TikaEntityProcessor and RegexTransformer.  To do that, I need to find
the phrase *Subject Headings*, which is underlined.  I can't seem to find
the correct regex for this.  Any suggestions?

Here is my data-config.xml:

<dataConfig>
  <!--
    Indexes the PDF files found under baseDir. The outer entity lists the
    files; the inner entity runs each file through Tika and derives extra
    fields from the extracted text with RegexTransformer.
  -->
  <dataSource type="BinFileDataSource" name="bin"/>
  <document>
    <!-- fileName is a regular expression, so the dot is escaped and the
         pattern is anchored at the end. A bare ".pdf" would also match names
         such as "xpdf.txt". rootEntity="false" makes the inner entity the
         one that produces Solr documents. -->
    <entity name="f"
            processor="FileListEntityProcessor"
            baseDir="/Applications/apache-solr-3.5.0/oral_history/data"
            fileName=".*\.pdf$"
            rootEntity="false">
      <!-- One document per file, read through the binary data source. -->
      <entity name="tika"
              processor="TikaEntityProcessor"
              url="${f.fileAbsolutePath}"
              dataSource="bin"
              transformer="RegexTransformer">
        <!-- Document id such as "OH-123", taken from the PDF title metadata. -->
        <field column="title"
               meta="true"
               regex="([oO][hH]-[0-9]{1,3})"
               name="docName"/>
        <!-- Full extracted body text; source column for the regex fields below. -->
        <field column="text"/>
        <!-- Text preceding the "(OH-..." marker; only group 1 is named and kept. -->
        <field column="interviewee"
               name="interviewee"
               regex="(.+)([\n]?)(\(OH-.*)"
               groupNames="interviewee, , "
               sourceColName="text"/>
        <!-- Date following the "Interview Date:" label; only group 2 is kept. -->
        <field column="interviewDate"
               name="interviewDate"
               regex="(Interview Date:)(\s{1,}\w+\s{1,}\d{1,2},\s\d{4})(.*)"
               groupNames=" ,interviewDate, "
               sourceColName="text"/>
        <!-- The text extracted by Tika carries no underline formatting, so the
             regex cannot look for underlining. Match the label text itself and
             capture what follows it. The previous pattern captured only the
             literal label. NOTE(review): assumes the headings sit on the
             label's line or on the next non-blank line; extend the tail of the
             pattern if they span several lines. -->
        <field column="subjects"
               name="subjects"
               regex="Subject Headings\s*:?\s*(.+)"
               groupNames="subjects"
               sourceColName="text"/>
      </entity>
    </entity>
  </document>
</dataConfig>