You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/10/19 10:01:42 UTC
[opennlp] branch master updated: OPENNLP-1144: Add word vector support
This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new a800c6a OPENNLP-1144: Add word vector support
a800c6a is described below
commit a800c6ae0dc4e6badcbc9f3812a69422f86314df
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Mon Oct 9 20:48:51 2017 +0200
OPENNLP-1144: Add word vector support
---
.../tools/util/wordvector/DoubleArrayVector.java | 74 +++++++++++++++++++
.../tools/util/wordvector/FloatArrayVector.java | 74 +++++++++++++++++++
.../java/opennlp/tools/util/wordvector/Glove.java | 82 ++++++++++++++++++++++
.../tools/util/wordvector/MapWordVectorTable.java | 49 +++++++++++++
.../opennlp/tools/util/wordvector/WordVector.java | 45 ++++++++++++
.../tools/util/wordvector/WordVectorTable.java | 35 +++++++++
.../tools/util/wordvector/WordVectorType.java | 26 +++++++
7 files changed, 385 insertions(+)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java
new file mode 100644
index 0000000..dcbee86
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/DoubleArrayVector.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+/**
+ * A {@link WordVector} whose components are held in a {@code double} array.
+ * <p>
+ * The array handed to the constructor is stored by reference and is not copied,
+ * so the creator must not modify it afterwards. None of the accessors hands out
+ * a writable reference to that array: the array getters return fresh copies and
+ * the buffer getters return read-only buffers.
+ */
+class DoubleArrayVector implements WordVector {
+
+  private final double[] vector;
+
+  /**
+   * @param vector the component values; stored by reference, not copied
+   */
+  DoubleArrayVector(double[] vector) {
+    this.vector = vector;
+  }
+
+  @Override
+  public WordVectorType getDataType() {
+    return WordVectorType.DOUBLE;
+  }
+
+  @Override
+  public float getAsFloat(int index) {
+    // Narrowing cast: the stored double is converted to float on every call.
+    return (float) getAsDouble(index);
+  }
+
+  @Override
+  public double getAsDouble(int index) {
+    return vector[index];
+  }
+
+  /**
+   * @return a newly allocated array with every component cast to {@code float}
+   */
+  @Override
+  public float[] toFloatArray() {
+    float[] floatVector = new float[vector.length];
+    for (int i = 0; i < floatVector.length; i++) {
+      floatVector[i] = (float) vector[i];
+    }
+    return floatVector;
+  }
+
+  /**
+   * @return a newly allocated copy of the components
+   */
+  @Override
+  public double[] toDoubleArray() {
+    // Return a copy. Going through DoubleBuffer.wrap(vector).array() would hand out
+    // the backing array itself and let callers overwrite the stored components.
+    return vector.clone();
+  }
+
+  /**
+   * @return a read-only buffer over a {@code float} copy of the components
+   */
+  @Override
+  public FloatBuffer toFloatBuffer() {
+    return FloatBuffer.wrap(toFloatArray()).asReadOnlyBuffer();
+  }
+
+  /**
+   * @return a read-only buffer that views the stored components without copying them
+   */
+  @Override
+  public DoubleBuffer toDoubleBuffer() {
+    // Read-only, like toFloatBuffer(), because this buffer shares the internal array.
+    return DoubleBuffer.wrap(vector).asReadOnlyBuffer();
+  }
+
+  @Override
+  public int dimension() {
+    return vector.length;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java
new file mode 100644
index 0000000..c6be1f1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/FloatArrayVector.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+/**
+ * A {@link WordVector} whose components are held in a {@code float} array.
+ * <p>
+ * The array handed to the constructor is stored by reference and is not copied,
+ * so the creator must not modify it afterwards.
+ */
+class FloatArrayVector implements WordVector {
+
+  private final float[] vector;
+
+  /**
+   * @param vector the component values; stored by reference, not copied
+   */
+  FloatArrayVector(float[] vector) {
+    this.vector = vector;
+  }
+
+  @Override
+  public WordVectorType getDataType() {
+    return WordVectorType.FLOAT;
+  }
+
+  @Override
+  public float getAsFloat(int index) {
+    return vector[index];
+  }
+
+  @Override
+  public double getAsDouble(int index) {
+    // Widening conversion from float to double.
+    return getAsFloat(index);
+  }
+
+  /**
+   * @return a newly allocated copy of the components
+   */
+  @Override
+  public float[] toFloatArray() {
+    // Do not go through toFloatBuffer(): that buffer is read-only and calling
+    // array() on a read-only buffer throws ReadOnlyBufferException.
+    return vector.clone();
+  }
+
+  /**
+   * @return a newly allocated array with every component widened to {@code double}
+   */
+  @Override
+  public double[] toDoubleArray() {
+    double[] doubleVector = new double[vector.length];
+    for (int i = 0; i < doubleVector.length; i++) {
+      doubleVector[i] = vector[i];
+    }
+    return doubleVector;
+  }
+
+  /**
+   * @return a read-only buffer that views the stored components without copying them
+   */
+  @Override
+  public FloatBuffer toFloatBuffer() {
+    return FloatBuffer.wrap(vector).asReadOnlyBuffer();
+  }
+
+  /**
+   * @return a buffer wrapping a newly allocated {@code double} copy of the components
+   */
+  @Override
+  public DoubleBuffer toDoubleBuffer() {
+    return DoubleBuffer.wrap(toDoubleArray());
+  }
+
+  @Override
+  public int dimension() {
+    return vector.length;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java
new file mode 100644
index 0000000..c0c4053
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/Glove.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * Reads word vectors from the glove plain text format.
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ */
+@Experimental
+public class Glove {
+
+  private Glove() {
+  }
+
+  /**
+   * Parses a glove vector plain text file.
+   * <p>
+   * The input is decoded as UTF-8 and read line by line until the end of the stream.
+   * Each line is split on single space characters: the first field is the token, every
+   * following field is one vector component. The number of components is taken from the
+   * first line and every later line must have the same number. If a token occurs more
+   * than once, the vector of its last occurrence is kept.
+   * <p>
+   * The stream is not closed by this method.
+   * <p>
+   * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+   *
+   * @param in the stream to read the vectors from
+   * @return a table that maps each token to its vector; empty if the stream has no lines
+   * @throws IOException if reading from the stream fails, if the lines do not all have
+   *     the same number of components, or if a component is not a valid float
+   */
+  @Experimental
+  public static WordVectorTable parse(InputStream in) throws IOException {
+    // The reader is not closed here because that would also close the caller's stream.
+    BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8),
+        1024 * 1024);
+
+    Map<String, WordVector> vectors = new HashMap<>();
+
+    // -1 until the first line has been seen, afterwards fixed to that line's component count.
+    int dimension = -1;
+    String line;
+    while ((line = reader.readLine()) != null) {
+      String[] parts = line.split(" ");
+
+      if (dimension == -1) {
+        dimension = parts.length - 1;
+      }
+      else if (dimension != parts.length - 1) {
+        throw new IOException("Vector dimension must be constant!");
+      }
+
+      String token = parts[0];
+
+      float[] vector = new float[dimension];
+
+      try {
+        for (int i = 0; i < vector.length; i++) {
+          vector[i] = Float.parseFloat(parts[i + 1]);
+        }
+      }
+      catch (NumberFormatException e) {
+        // Report malformed input through the declared checked exception, keeping the cause.
+        throw new IOException("Invalid vector component for token '" + token + "'", e);
+      }
+
+      vectors.put(token, new FloatArrayVector(vector));
+    }
+
+    return new MapWordVectorTable(Collections.unmodifiableMap(vectors));
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java
new file mode 100644
index 0000000..83776d8
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/MapWordVectorTable.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.util.Map;
+
+class MapWordVectorTable implements WordVectorTable {
+
+ private final Map<String, WordVector> vectors;
+
+ MapWordVectorTable(Map<String, WordVector> vectors) {
+ this.vectors = vectors;
+ }
+
+ @Override
+ public WordVector get(CharSequence token) {
+ return vectors.get(token.toString());
+ }
+
+ @Override
+ public int size() {
+ return vectors.size();
+ }
+
+ @Override
+ public int dimension() {
+ if (vectors.size() > 0) {
+ return vectors.values().iterator().next().dimension();
+ }
+ else {
+ return -1;
+ }
+ }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java
new file mode 100644
index 0000000..94acb54
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVector.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * A word vector.
+ * <p>
+ * The components can be read one at a time, as arrays or as NIO buffers, each in
+ * {@code float} or {@code double} form.
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ */
+@Experimental
+public interface WordVector {
+
+  /**
+   * @return the type the implementation reports for its components
+   */
+  WordVectorType getDataType();
+
+  /**
+   * @return the number of components in this vector
+   */
+  int dimension();
+
+  /**
+   * @param index the position of the component to read
+   * @return the component at {@code index} as a {@code float}
+   */
+  float getAsFloat(int index);
+
+  /**
+   * @param index the position of the component to read
+   * @return the component at {@code index} as a {@code double}
+   */
+  double getAsDouble(int index);
+
+  /**
+   * @return the components as a {@code float} array
+   */
+  float[] toFloatArray();
+
+  /**
+   * @return the components as a {@code double} array
+   */
+  double[] toDoubleArray();
+
+  /**
+   * Whether the buffer is writable or shares storage with this vector depends on
+   * the implementation.
+   *
+   * @return the components as a {@link FloatBuffer}
+   */
+  FloatBuffer toFloatBuffer();
+
+  /**
+   * Whether the buffer is writable or shares storage with this vector depends on
+   * the implementation.
+   *
+   * @return the components as a {@link DoubleBuffer}
+   */
+  DoubleBuffer toDoubleBuffer();
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java
new file mode 100644
index 0000000..bf00858
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorTable.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import opennlp.tools.util.java.Experimental;
+
+/**
+ * A table that maps tokens to word vectors.
+ * <p>
+ * Warning: Experimental new feature, see OPENNLP-1144 for details, the API might be changed anytime.
+ */
+@Experimental
+public interface WordVectorTable {
+
+  /**
+   * @param token the token to look up
+   * @return the vector the table holds for {@code token}
+   */
+  WordVector get(CharSequence token);
+
+  /**
+   * @return the number of entries in the table
+   */
+  int size();
+
+  /**
+   * @return the dimension reported by the table for its vectors
+   */
+  int dimension();
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java
new file mode 100644
index 0000000..0e52652
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/wordvector/WordVectorType.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.wordvector;
+
+import opennlp.tools.util.java.Experimental;
+
+@Experimental
+public enum WordVectorType {
+ FLOAT,
+ DOUBLE;
+}
--
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].