You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/11/15 21:01:22 UTC
svn commit: r1714495 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic: GeoParserConfig.java NameEntityExtractor.java
Author: nick
Date: Sun Nov 15 20:01:22 2015
New Revision: 1714495
URL: http://svn.apache.org/viewvc?rev=1714495&view=rev
Log:
Fix inconsistent whitespace
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714495&r1=1714494&r2=1714495&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Sun Nov 15 20:01:22 2015
@@ -23,34 +23,31 @@ import java.net.MalformedURLException;
import java.net.URL;
public class GeoParserConfig implements Serializable {
+ private static final long serialVersionUID = -3167692634278575818L;
+ private URL nerModelUrl = null;
- private static final long serialVersionUID = 2L;
- private URL nerModelUrl = null;
-
- public GeoParserConfig() {
- this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
- }
-
- public void setNERModelPath(String path) {
- if (path == null)
- return;
- File file = new File(path);
- if (file.isDirectory() || !file.exists()) {
- return;
- }
- try {
- this.nerModelUrl = file.toURI().toURL();
- } catch (MalformedURLException e) {
- throw new RuntimeException(e);
- }
- }
-
- public void setNerModelUrl(URL url) {
- this.nerModelUrl = url;
- }
-
- public URL getNerModelUrl() {
- return nerModelUrl;
- }
-
+ public GeoParserConfig() {
+ this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
+ }
+
+ public void setNERModelPath(String path) {
+ if (path == null)
+ return;
+ File file = new File(path);
+ if (file.isDirectory() || !file.exists()) {
+ return;
+ }
+ try {
+ this.nerModelUrl = file.toURI().toURL();
+ } catch (MalformedURLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void setNerModelUrl(URL url) {
+ this.nerModelUrl = url;
+ }
+ public URL getNerModelUrl() {
+ return nerModelUrl;
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714495&r1=1714494&r2=1714495&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Sun Nov 15 20:01:22 2015
@@ -37,93 +37,88 @@ import org.apache.commons.io.IOUtils;
import static java.nio.charset.StandardCharsets.UTF_8;
public class NameEntityExtractor {
-
- ArrayList<String> locationNameEntities;
- String bestNameEntity;
- private HashMap<String, Integer> tf;
- private final NameFinderME nameFinder;
-
- public NameEntityExtractor(URL modelUrl) throws IOException {
- this.locationNameEntities = new ArrayList<String>();
- this.bestNameEntity = null;
- TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
- this.nameFinder = new NameFinderME(model);
- this.tf = new HashMap<String, Integer>();
- }
-
- /*
- * Use OpenNLP to extract location names that's appearing in the steam.
- * OpenNLP's default Name Finder accuracy is not very good, please refer to
- * its documentation.
- *
- * @param stream stream that passed from this.parse()
- */
-
- public void getAllNameEntitiesfromInput(InputStream stream)
- throws IOException {
-
-
- String[] in = IOUtils.toString(stream, UTF_8).split(" ");
- Span nameE[];
- //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
- synchronized (nameFinder) {
- nameE = nameFinder.find(in);
- //the same name finder is reused, so clear adaptive data
- nameFinder.clearAdaptiveData();
- }
-
- String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
- spanNames = spanNames.substring(1, spanNames.length() - 1);
- String[] tmp = spanNames.split(",");
-
- for (String name : tmp) {
- name = name.trim();
- this.locationNameEntities.add(name);
- }
-
-
- }
-
- /*
- * Get the best location entity extracted from the input stream. Simply
- * return the most frequent entity, If there several highest frequent
- * entity, pick one randomly. May not be the optimal solution, but works.
- *
- * @param locationNameEntities OpenNLP name finder's results, stored in
- * ArrayList
- */
- public void getBestNameEntity() {
- if (this.locationNameEntities.size() == 0)
- return;
-
- for (int i = 0; i < this.locationNameEntities.size(); ++i) {
- if (tf.containsKey(this.locationNameEntities.get(i)))
- tf.put(this.locationNameEntities.get(i),
- tf.get(this.locationNameEntities.get(i)) + 1);
- else
- tf.put(this.locationNameEntities.get(i), 1);
- }
- int max = 0;
- List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
- tf.entrySet());
- Collections.shuffle(list);
- Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
- public int compare(Map.Entry<String, Integer> o1,
- Map.Entry<String, Integer> o2) {
- return o2.getValue().compareTo(o1.getValue()); // descending
- // order
-
- }
- });
-
- this.locationNameEntities.clear();// update so that they are in
- // descending order
- for (Map.Entry<String, Integer> entry : list) {
- this.locationNameEntities.add(entry.getKey());
- if (entry.getValue() > max) {
- max = entry.getValue();
- this.bestNameEntity = entry.getKey();
- }
- }
- }
+ ArrayList<String> locationNameEntities;
+ String bestNameEntity;
+ private HashMap<String, Integer> tf;
+ private final NameFinderME nameFinder;
+
+ public NameEntityExtractor(URL modelUrl) throws IOException {
+ this.locationNameEntities = new ArrayList<String>();
+ this.bestNameEntity = null;
+ TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+ this.nameFinder = new NameFinderME(model);
+ this.tf = new HashMap<String, Integer>();
+ }
+
+ /*
+ * Use OpenNLP to extract location names that's appearing in the steam.
+ * OpenNLP's default Name Finder accuracy is not very good, please refer to
+ * its documentation.
+ *
+ * @param stream stream that passed from this.parse()
+ */
+ public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+ String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+ Span nameE[];
+
+ //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+ synchronized (nameFinder) {
+ nameE = nameFinder.find(in);
+ //the same name finder is reused, so clear adaptive data
+ nameFinder.clearAdaptiveData();
+ }
+
+ String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
+ spanNames = spanNames.substring(1, spanNames.length() - 1);
+ String[] tmp = spanNames.split(",");
+
+ for (String name : tmp) {
+ name = name.trim();
+ this.locationNameEntities.add(name);
+ }
+
+
+ }
+
+ /*
+ * Get the best location entity extracted from the input stream. Simply
+ * return the most frequent entity, If there several highest frequent
+ * entity, pick one randomly. May not be the optimal solution, but works.
+ *
+ * @param locationNameEntities OpenNLP name finder's results, stored in
+ * ArrayList
+ */
+ public void getBestNameEntity() {
+ if (this.locationNameEntities.size() == 0)
+ return;
+
+ for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+ if (tf.containsKey(this.locationNameEntities.get(i)))
+ tf.put(this.locationNameEntities.get(i),
+ tf.get(this.locationNameEntities.get(i)) + 1);
+ else
+ tf.put(this.locationNameEntities.get(i), 1);
+ }
+ int max = 0;
+ List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+ tf.entrySet());
+ Collections.shuffle(list);
+ Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+ public int compare(Map.Entry<String, Integer> o1,
+ Map.Entry<String, Integer> o2) {
+ // Descending Order
+ return o2.getValue().compareTo(o1.getValue());
+ }
+ });
+
+ this.locationNameEntities.clear();// update so that they are in
+ // descending order
+ for (Map.Entry<String, Integer> entry : list) {
+ this.locationNameEntities.add(entry.getKey());
+ if (entry.getValue() > max) {
+ max = entry.getValue();
+ this.bestNameEntity = entry.getKey();
+ }
+ }
+ }
}