You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/10/19 10:01:42 UTC

[opennlp] branch master updated: OPENNLP-1144: Add word vector support

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new a800c6a  OPENNLP-1144: Add word vector support
a800c6a is described below

commit a800c6ae0dc4e6badcbc9f3812a69422f86314df
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Mon Oct 9 20:48:51 2017 +0200

    OPENNLP-1144: Add word vector support
---
 .../tools/util/wordvector/DoubleArrayVector.java   | 74 +++++++++++++++++++
 .../tools/util/wordvector/FloatArrayVector.java    | 74 +++++++++++++++++++
 .../java/opennlp/tools/util/wordvector/Glove.java  | 82 ++++++++++++++++++++++
 .../tools/util/wordvector/MapWordVectorTable.java  | 49 +++++++++++++
 .../opennlp/tools/util/wordvector/WordVector.java  | 45 ++++++++++++
 .../tools/util/wordvector/WordVectorTable.java     | 35 +++++++++
 .../tools/util/wordvector/WordVectorType.java      | 26 +++++++
 7 files changed, 385 insertions(+)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java
new file mode 100644
index 0000000..dcbee86
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+/**
+ * A {@link WordVector} whose components are stored in a {@code double} array.
+ * <p>
+ * The array handed to the constructor is used directly (it is not copied). Every accessor
+ * that hands out an array or a buffer returns either a copy or a read-only view, so the
+ * stored components cannot be modified through this class.
+ */
+class DoubleArrayVector implements WordVector {
+
+  private final double[] vector;
+
+  /**
+   * Creates a vector over the given components.
+   *
+   * @param vector the components; the array is stored as given and should not be
+   *     modified by the caller afterwards
+   */
+  DoubleArrayVector(double[] vector) {
+    this.vector = vector;
+  }
+
+  /**
+   * @return always {@link WordVectorType#DOUBLE}
+   */
+  @Override
+  public WordVectorType getDataType() {
+    return WordVectorType.DOUBLE;
+  }
+
+  /**
+   * @param index the position of the component
+   * @return the component at {@code index}, narrowed to {@code float}
+   */
+  @Override
+  public float getAsFloat(int index) {
+    return (float) getAsDouble(index);
+  }
+
+  /**
+   * @param index the position of the component
+   * @return the component at {@code index}
+   */
+  @Override
+  public double getAsDouble(int index) {
+    return vector[index];
+  }
+
+  /**
+   * @return a newly allocated array holding every component narrowed to {@code float}
+   */
+  @Override
+  public float[] toFloatArray() {
+    float[] floatVector = new float[vector.length];
+    for (int i = 0; i < floatVector.length ; i++) {
+      floatVector[i] = (float) vector[i];
+    }
+    return floatVector;
+  }
+
+  /**
+   * @return a copy of the components; changes to the returned array do not affect this vector
+   */
+  @Override
+  public double[] toDoubleArray() {
+    // Defensive copy: going through toDoubleBuffer().array() would expose the internal array.
+    return vector.clone();
+  }
+
+  /**
+   * @return a read-only buffer over a {@code float} copy of the components
+   */
+  @Override
+  public FloatBuffer toFloatBuffer() {
+    return FloatBuffer.wrap(toFloatArray()).asReadOnlyBuffer();
+  }
+
+  /**
+   * @return a read-only buffer view of the components (no copy is made)
+   */
+  @Override
+  public DoubleBuffer toDoubleBuffer() {
+    // Read-only, like toFloatBuffer(), so the stored components cannot be written through it.
+    return DoubleBuffer.wrap(vector).asReadOnlyBuffer();
+  }
+
+  /**
+   * @return the number of components
+   */
+  @Override
+  public int dimension() {
+    return vector.length;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java
new file mode 100644
index 0000000..c6be1f1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+/**
+ * A {@link WordVector} whose components are stored in a {@code float} array.
+ * <p>
+ * The array handed to the constructor is used directly (it is not copied).
+ */
+class FloatArrayVector implements WordVector {
+
+  private final float[] vector;
+
+  /**
+   * Creates a vector over the given components.
+   *
+   * @param vector the components; the array is stored as given and should not be
+   *     modified by the caller afterwards
+   */
+  FloatArrayVector(float[] vector) {
+    this.vector = vector;
+  }
+
+  /**
+   * @return always {@link WordVectorType#FLOAT}
+   */
+  @Override
+  public WordVectorType getDataType() {
+    return WordVectorType.FLOAT;
+  }
+
+  /**
+   * @param index the position of the component
+   * @return the component at {@code index}
+   */
+  @Override
+  public float getAsFloat(int index) {
+    return vector[index];
+  }
+
+  /**
+   * @param index the position of the component
+   * @return the component at {@code index}, widened to {@code double}
+   */
+  @Override
+  public double getAsDouble(int index) {
+    return getAsFloat(index);
+  }
+
+  /**
+   * @return a copy of the components; changes to the returned array do not affect this vector
+   */
+  @Override
+  public float[] toFloatArray() {
+    // Must not go through toFloatBuffer(): that buffer is read-only and its array()
+    // method throws ReadOnlyBufferException.
+    return vector.clone();
+  }
+
+  /**
+   * @return a newly allocated array holding every component widened to {@code double}
+   */
+  @Override
+  public double[] toDoubleArray() {
+    double[] doubleVector = new double[vector.length];
+    for (int i = 0; i < doubleVector.length ; i++) {
+      doubleVector[i] = vector[i];
+    }
+    return doubleVector;
+  }
+
+  /**
+   * @return a read-only buffer view of the components (no copy is made)
+   */
+  @Override
+  public FloatBuffer toFloatBuffer() {
+    return FloatBuffer.wrap(vector).asReadOnlyBuffer();
+  }
+
+  /**
+   * @return a buffer wrapping a {@code double} copy of the components
+   */
+  @Override
+  public DoubleBuffer toDoubleBuffer() {
+    return DoubleBuffer.wrap(toDoubleArray());
+  }
+
+  /**
+   * @return the number of components
+   */
+  @Override
+  public int dimension() {
+    return vector.length;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java
new file mode 100644
index 0000000..c0c4053
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * Reads word vectors stored in the GloVe plain text format.
+ *
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ */
+@Experimental
+public class Glove {
+
+  private Glove() {
+    // static utility class, not meant to be instantiated
+  }
+
+  /**
+   * Parses a glove vector plain text file.
+   * <p>
+   * Every non-blank line is expected to hold a token followed by its vector components,
+   * separated by single spaces. Blank lines are skipped. The number of components of the
+   * first entry defines the dimension; all following entries must have the same dimension.
+   * If a token occurs more than once, the last entry wins. The input is decoded as UTF-8
+   * and the stream is not closed by this method.
+   * <p>
+   * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+   *
+   * @param in the stream to read the vectors from; it stays open and remains owned by the caller
+   * @return a table that maps each parsed token to its vector
+   * @throws IOException if reading fails, if the entries do not all have the same number of
+   *     components, or if a component cannot be parsed as a float
+   */
+  @Experimental
+  public static WordVectorTable parse(InputStream in) throws IOException {
+    // Deliberately not closed: closing the reader would close the caller's stream.
+    BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8),
+        1024 * 1024);
+
+    Map<String, WordVector> vectors = new HashMap<>();
+
+    int dimension = -1;
+    int lineNumber = 0;
+    String line;
+    while ((line = reader.readLine()) != null) {
+      lineNumber++;
+
+      // A blank line (e.g. at the end of the file) carries no entry. Without this check it
+      // would be read as an empty token of dimension 0, or fail on an empty split result.
+      if (line.trim().isEmpty()) {
+        continue;
+      }
+
+      String[] parts = line.split(" ");
+
+      if (dimension == -1) {
+        dimension = parts.length - 1;
+      }
+      else if (dimension != parts.length - 1) {
+        throw new IOException("Vector dimension must be constant!");
+      }
+
+      String token = parts[0];
+
+      float[] vector = new float[dimension];
+
+      try {
+        for (int i = 0; i < vector.length; i++) {
+          vector[i] = Float.parseFloat(parts[i + 1]);
+        }
+      }
+      catch (NumberFormatException e) {
+        // Report malformed content through the declared checked exception, keeping the cause.
+        throw new IOException("Invalid vector component in line " + lineNumber + "!", e);
+      }
+
+      vectors.put(token, new FloatArrayVector(vector));
+    }
+
+    return new MapWordVectorTable(Collections.unmodifiableMap(vectors));
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java
new file mode 100644
index 0000000..83776d8
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.util.Map;
+
+class MapWordVectorTable implements WordVectorTable {
+
+  private final Map<String, WordVector> vectors;
+
+  MapWordVectorTable(Map<String, WordVector> vectors) {
+    this.vectors = vectors;
+  }
+
+  @Override
+  public WordVector get(CharSequence token) {
+    return vectors.get(token.toString());
+  }
+
+  @Override
+  public int size() {
+    return vectors.size();
+  }
+
+  @Override
+  public int dimension() {
+    if (vectors.size() > 0) {
+      return vectors.values().iterator().next().dimension();
+    }
+    else {
+      return -1;
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java
new file mode 100644
index 0000000..94acb54
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * A word vector.
+ *
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ *
+ * <p>
+ * The components can be read one at a time ({@link #getAsFloat(int)},
+ * {@link #getAsDouble(int)}), as arrays ({@link #toFloatArray()}, {@link #toDoubleArray()})
+ * or as NIO buffers ({@link #toFloatBuffer()}, {@link #toDoubleBuffer()}), each in
+ * {@code float} or {@code double} form. {@link #getDataType()} reports a
+ * {@link WordVectorType} and {@link #dimension()} the number of components.
+ *
+ * <p>
+ * NOTE(review): this interface does not specify whether returned arrays and buffers are
+ * copies, read-only views, or share storage with the vector - check the concrete
+ * implementation before modifying a returned value.
+ */
+@Experimental
+public interface WordVector {
+  WordVectorType getDataType();
+
+  float getAsFloat(int index);
+  double getAsDouble(int index);
+
+  float[] toFloatArray();
+  double[] toDoubleArray();
+
+  FloatBuffer toFloatBuffer();
+  DoubleBuffer toDoubleBuffer();
+
+  int dimension();
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java
new file mode 100644
index 0000000..bf00858
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * A table that maps tokens to word vectors.
+ *
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ *
+ * <p>
+ * {@link #get(CharSequence)} looks up the vector for a token, {@link #size()} reports the
+ * number of entries and {@link #dimension()} the dimension of the stored vectors. The
+ * map-backed implementation returns {@code -1} from {@link #dimension()} when it holds
+ * no vectors.
+ *
+ * <p>
+ * NOTE(review): the result of {@link #get(CharSequence)} for an unknown token is not
+ * specified here; presumably {@code null} for the map-backed implementation - confirm.
+ */
+@Experimental
+public interface WordVectorTable {
+  WordVector get(CharSequence token);
+
+  int size();
+
+  int dimension();
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java
new file mode 100644
index 0000000..0e52652
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import opennlp.tools.util.java.Experimental;
+
+@Experimental
+public enum WordVectorType { // data type a WordVector reports through getDataType()
+  FLOAT, // reported by the vector implementation that stores its components in a float array
+  DOUBLE; // reported by the vector implementation that stores its components in a double array
+}

-- 
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].