You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Rick Taylor <rx...@gmail.com> on 2012/03/21 03:40:41 UTC
Using RegexTransformer to find underlined text
I am trying to extract subject headings from the text of a PDF document
using TikaEntityProcessor and RegexTransformer. To do that, I need to find
the phrase *Subject Headings*, which is underlined. I can't seem to find
the correct regex for this. Any suggestions?
Here is my data-config.xml:
<!-- DataImportHandler configuration: lists PDF files under a directory, hands each
     file to Tika, and derives extra fields from the results with RegexTransformer. -->
<dataConfig>
  <dataSource type="BinFileDataSource" name="bin"/>
  <document>
    <!-- Outer entity: enumerates files under baseDir. fileName is a regular
         expression, so the dot is escaped and the pattern is anchored to select
         only names that end in ".pdf". rootEntity="false" is set here, so this
         entity's rows are not themselves the documents. -->
    <entity name="f"
            processor="FileListEntityProcessor"
            baseDir="/Applications/apache-solr-3.5.0/oral_history/data"
            fileName=".*\.pdf$"
            rootEntity="false">
      <!-- Inner entity: reads the file found by the outer entity through the
           binary data source and applies the regex-based fields below. -->
      <entity processor="TikaEntityProcessor"
              url="${f.fileAbsolutePath}"
              dataSource="bin"
              transformer="RegexTransformer">
        <!-- Matches an identifier of the form OH-nnn (any letter case, 1 to 3
             digits) in the title metadata column. -->
        <field column="title" meta="true"
               regex="([oO][hH]-[0-9]{1,3})"
               name="docName"/>
        <field column="text"/>
        <!-- Three capture groups over the text column; only the first group is
             named, so only it is mapped to a column (interviewee). -->
        <field column="interviewee" name="interviewee"
               regex="(.+)([\n]?)(\(OH-.*)"
               groupNames="interviewee, , "
               sourceColName="text"/>
        <!-- Three capture groups over the text column; only the second group
             (the "Month D, YYYY" part after the label) is named. -->
        <field column="interviewDate" name="interviewDate"
               regex="(Interview Date:)(\s{1,}\w+\s{1,}\d{1,2},\s\d{4})(.*)"
               groupNames=" ,interviewDate, "
               sourceColName="text"/>
        <!-- NOTE(review): this pattern captures only the literal phrase
             "Subject Headings", not the headings that follow it. Presumably the
             extracted text carries no underline formatting to match on; confirm
             against the actual Tika output, then extend the pattern with a
             capture group for the text after the phrase. -->
        <field column="subjects" name="subjects"
               regex="(Subject Headings)"
               groupNames="subjects"
               sourceColName="text"/>
      </entity>
    </entity>
  </document>
</dataConfig>