Posted to commits@nlpcraft.apache.org by if...@apache.org on 2020/06/07 22:42:47 UTC
[incubator-nlpcraft] 05/06: NLPCRAFT-67: Add batching
This is an automated email from the ASF dual-hosted git repository.
ifropc pushed a commit to branch NLPCRAFT-67
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit b614527d717a830684bbdbacc9ead7e61091e7af
Author: Ifropc <if...@apache.org>
AuthorDate: Sun Jun 7 14:21:02 2020 -0700
NLPCRAFT-67: Add batching
---
src/main/python/ctxword/bertft/bertft.py | 121 ++++++++++-----------
src/main/python/ctxword/bin/predict.sh | 2 +-
.../ctxword/jupyter/Trasnsformers-FastText.ipynb | 25 +++--
src/main/python/ctxword/server.py | 11 +-
4 files changed, 82 insertions(+), 77 deletions(-)
diff --git a/src/main/python/ctxword/bertft/bertft.py b/src/main/python/ctxword/bertft/bertft.py
index 6d27d7a..adb37a9 100644
--- a/src/main/python/ctxword/bertft/bertft.py
+++ b/src/main/python/ctxword/bertft/bertft.py
@@ -33,10 +33,6 @@ def lget(lst, pos):
return list(map(lambda x: x[pos], lst))
-def calc_w(x, y, w):
- return x * w[0] + y * w[1]
-
-
# TODO: make Model configurable
# TODO: add type check
class Pipeline:
@@ -77,11 +73,9 @@ class Pipeline:
self.tokenizer = AutoTokenizer.from_pretrained("roberta-large")
self.model = AutoModelWithLMHead.from_pretrained("roberta-large")
- self.on_run = on_run
-
self.log.info("Server started in %s seconds", ('{0:.4f}'.format(time.time() - start_time)))
- def find_top(self, sentence, index, k, top_bert, bert_norm, min_ftext, weights, min_score):
+ def find_top(self, input_data, k, top_bert, min_ftext, weights, min_score):
tokenizer = self.tokenizer
model = self.model
ft = self.ft
@@ -89,88 +83,93 @@ class Pipeline:
k = 10 if k is None else k
min_score = 0 if min_score is None else min_score
- self.log.debug("Input: %s", sentence)
start_time = time.time()
+ req_start_time = start_time
- lst = sentence.split()
+ sentences = list(map(lambda x: self.replace_with_mask(x[0], x[1]), input_data))
- target = lst[index]
+ encoded = tokenizer.batch_encode_plus(list(map(lambda x: x[1], sentences)), pad_to_max_length=True)
+ input_ids = torch.tensor(encoded['input_ids'])
+ attention_mask = torch.tensor(encoded['attention_mask'])
- seqlst = lst[:index]
- seqlst.append(tokenizer.mask_token)
- seqlst.extend(lst[(index + 1):])
- sequence = " ".join(seqlst)
+ start_time = self.print_time(start_time, "Tokenizing finished")
+ forward = model(input_ids=input_ids, attention_mask=attention_mask)
- self.log.debug("Target word: %s; sequence: %s", target, sequence)
+ start_time = self.print_time(start_time, "Batch finished (Bert)")
- input = tokenizer.encode(sequence, return_tensors="pt")
- mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
-
- token_logits = model(input)[0]
+ mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
+ token_logits = forward[0]
mask_token_logits = token_logits[0, mask_token_index, :]
# Filter top <top_bert> results of bert output
topk = torch.topk(mask_token_logits, top_bert, dim=1)
- top_tokens = list(zip(topk.indices[0].tolist(), topk.values[0].tolist()))
- unfiltered = list()
- filtered = list()
+ nvl = []
+
+ for d in topk.values:
+ nmin = torch.min(d)
+ nmax = torch.max(d)
+ nvl.append((d - nmin) / (nmax - nmin))
- norm_d = top_tokens[bert_norm - 1][1]
- norm_k = top_tokens[0][1] - norm_d
+ start_time = self.print_time(start_time, "Bert post-processing")
- self.log.info("Bert finished in %s seconds", '{0:.4f}'.format(time.time() - start_time))
+ suggestions = []
+ for index in topk.indices:
+ lst = list(index)
+ tmp = []
+ for single in lst:
+ tmp.append(tokenizer.decode([single]).strip())
+ suggestions.append(tuple(tmp))
- # Filter bert output by <min_ftext>
- # TODO: calculate batch similarity
- for token, value in top_tokens:
- word = tokenizer.decode([token]).strip()
- norm_value = (value - norm_d) / norm_k
+ start_time = self.print_time(start_time, "Bert decoding")
- sim = cosine_similarity(ft[target].reshape(1, -1), ft[word].reshape(1, -1))[0][0]
+ cos = torch.nn.CosineSimilarity()
- sentence_sim = cosine_similarity(
- ft.get_sentence_vector(sentence).reshape(1, -1),
- ft.get_sentence_vector(re.sub(tokenizer.mask_token, word, sequence)).reshape(1, -1)
- )[0][0]
+ result = []
- # Continue only for jupyter
- if self.on_run is None and word == target:
- continue
+ for i in range(0, len(sentences)):
+ target = sentences[i][0]
+ suggest_embeddings = torch.tensor(list(map(lambda x: ft[x], suggestions[i])))
+ target_tensor = torch.tensor(ft[target]).expand(suggest_embeddings.shape)
+ similarities = cos(target_tensor, suggest_embeddings)
- score = calc_w(norm_value, sim, weights)
+ scores = nvl[i] * weights[0] + similarities * weights[1]
- if sim >= min_ftext and score > min_score:
- filtered.append((word, value, norm_value, sim, sentence_sim, score))
+ result.append(
+ sorted(
+ filter(
+ lambda x: x[0] > min_score and x[1] > min_ftext,
+ zip(scores.tolist(), similarities.tolist(), suggestions[i], nvl[i].tolist())
+ ),
+ key=lambda x: x[0],
+ reverse=True
+ )[:k]
+ )
- unfiltered.append((word, value, norm_value, sim, sentence_sim, score))
+ self.print_time(start_time, "Fast text similarities found")
- done = (time.time() - start_time)
+ self.print_time(req_start_time, "Request processed")
- kfiltered = filtered[:k]
- kunfiltered = unfiltered[:k]
+ return result
- kfiltered = sorted(kfiltered, key=lambda x: -x[len(x) - 1])
- kunfiltered = sorted(kunfiltered, key=lambda x: -x[len(x) - 1])
+ def replace_with_mask(self, sentence, index):
+ lst = sentence.split()
- filtered_top = pd.DataFrame({
- 'word': lget(kfiltered, 0),
- 'bert': self.dget(kfiltered, 1),
- 'normalized': self.dget(kfiltered, 2),
- 'ftext': self.dget(kfiltered, 3),
- 'ftext-sentence': self.dget(kfiltered, 4),
- 'score': lget(kfiltered, 5),
- })
+ target = lst[index]
- if self.on_run != None:
- self.on_run(self, kunfiltered, unfiltered, filtered_top, target, tokenizer, top_tokens)
+ seqlst = lst[:index]
+ seqlst.append(self.tokenizer.mask_token)
+ seqlst.extend(lst[(index + 1):])
- self.log.info("Processing finished in %s seconds", '{0:.4f}'.format(done))
+ return (target, " ".join(seqlst))
- return filtered_top
+ def print_time(self, start, message):
+ current = time.time()
+ self.log.info(message + " in %s ms", '{0:.4f}'.format((current - start) * 1000))
+ return current
- def do_find(self, s, index, limit, min_score):
- return self.find_top(s, index, limit, 200, 200, 0.25, [1, 1], min_score)
+ def do_find(self, data, limit, min_score):
+ return self.find_top(data, limit, 100, 0.25, [1, 1], min_score)
def dget(self, lst, pos):
return list(map(lambda x: '{0:.2f}'.format(x[pos]), lst)) if self.on_run is not None else lget(lst, pos)
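For readers following the new batched find_top flow, the per-sentence scoring it performs can be read as the standalone sketch below. It uses toy values in place of the BERT logits and fastText similarities, and the helper name score_candidates is illustrative only, not part of the patch.

import torch

# Minimal sketch of the scoring step in find_top:
# 1) min-max normalize the top-k BERT logits (the nvl rows above),
# 2) combine them with fastText cosine similarities using the weights,
# 3) filter by min_score / min_ftext and keep the best k.
def score_candidates(bert_logits, fasttext_sims, suggestions,
                     weights=(1, 1), min_score=0.0, min_ftext=0.25, k=10):
    logits = torch.tensor(bert_logits, dtype=torch.float)
    sims = torch.tensor(fasttext_sims, dtype=torch.float)

    normalized = (logits - logits.min()) / (logits.max() - logits.min())
    scores = normalized * weights[0] + sims * weights[1]

    rows = zip(scores.tolist(), sims.tolist(), suggestions, normalized.tolist())
    kept = [r for r in rows if r[0] > min_score and r[1] > min_ftext]
    return sorted(kept, key=lambda r: r[0], reverse=True)[:k]

# Toy values standing in for one sentence's model outputs.
print(score_candidates(
    bert_logits=[12.0, 10.5, 9.8],
    fasttext_sims=[0.61, 0.34, 0.12],
    suggestions=("forecast", "report", "window"),
))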
diff --git a/src/main/python/ctxword/bin/predict.sh b/src/main/python/ctxword/bin/predict.sh
index 0fd5f22..568c17a 100755
--- a/src/main/python/ctxword/bin/predict.sh
+++ b/src/main/python/ctxword/bin/predict.sh
@@ -16,4 +16,4 @@
# limitations under the License.
#
-curl -d "{\"sentence\": \"$1\",\"simple\": true, \"index\": $2, \"limit\": 10}" -H 'Content-Type: application/json' http://localhost:5000/suggestions
+curl -d "{\"sentences\": [[\"$1\", $2]], \"simple\": true, \"limit\": 10}" -H 'Content-Type: application/json' http://localhost:5000/suggestions
diff --git a/src/main/python/ctxword/jupyter/Trasnsformers-FastText.ipynb b/src/main/python/ctxword/jupyter/Trasnsformers-FastText.ipynb
index 8330cb0..59a000c 100644
--- a/src/main/python/ctxword/jupyter/Trasnsformers-FastText.ipynb
+++ b/src/main/python/ctxword/jupyter/Trasnsformers-FastText.ipynb
@@ -143,7 +143,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline = bertft.Pipeline(on_run)"
+ "pipeline = bertft.Pipeline()"
]
},
{
@@ -153,18 +153,25 @@
"outputs": [],
"source": [
"# Example of usage\n",
- "x = pipeline.find_top(\n",
- " \"what is the local weather forecast?\", # mark target word with #\n",
- " [4, 4], # or pass words position range (inclusive) in the sentece\n",
+ "res = pipeline.find_top(\n",
+ " # List of sentences with target word position\n",
+ " [\n",
+ " (\"what is the local weather forecast?\", 4),\n",
+ " (\"what is chances of rain tomorrow?\", 4),\n",
+ " (\"what is chances of rain tomorrow?\", 2),\n",
+ " (\"is driving a car faster then taking a bus?\", 3),\n",
+ " (\"who is the best football player of all time?\", 4)\n",
+ " ],\n",
" k = 20, # Filter best k results (by weighted score)\n",
- " top_bert = 200, # Number of initial filter of bert output \n",
- " bert_norm = 200, # Use this position for normalization of bert output \n",
+ " top_bert = 100, # Number of initial filter of bert output \n",
" min_ftext = 0.3, # Minimal required score of fast text \n",
" weights = [ # Weights of models scores to calculate total weighted score\n",
" 1, # bert\n",
" 1, # fast text\n",
- " ]\n",
- ")"
+ " ],\n",
+ " min_score = 0 # Minimum required score\n",
+ ")\n",
+ "print(res)"
]
},
{
@@ -191,7 +198,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.2"
+ "version": "3.8.3"
}
},
"nbformat": 4,
diff --git a/src/main/python/ctxword/server.py b/src/main/python/ctxword/server.py
index a5899a0..9846b56 100644
--- a/src/main/python/ctxword/server.py
+++ b/src/main/python/ctxword/server.py
@@ -17,6 +17,7 @@
import logging
from flask import Flask
from flask import request
+from flask import jsonify
from bertft import Pipeline
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
@@ -55,14 +56,12 @@ def main():
json = request.json
- sentence = present(json, 'sentence')
- index = present(json, 'index')
+ sentences = present(json, 'sentences')
limit = json['limit'] if 'limit' in json else None
min_score = json['min_score'] if 'min_score' in json else None
- data = pipeline.do_find(sentence, index, limit, min_score)
+ data = pipeline.do_find(sentences, limit, min_score)
if 'simple' not in json or not json['simple']:
- json_data = data.to_json(orient='table', index=False)
+ return jsonify(data)
else:
- json_data = data['word'].to_json(orient='values')
- return app.response_class(response=json_data, status=200, mimetype='application/json')
+ return jsonify(list(map(lambda x: list(map(lambda y: y[2], x)), data)))
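To make the two response shapes concrete, a rough sketch of what a client now receives is given below. The numeric values are illustrative placeholders; only the tuple layout (weighted score, fastText similarity, word, normalized BERT score) follows from find_top above.

# With "simple": true the endpoint returns only the suggested words,
# one list per input sentence.
simple_response = [
    ["forecast", "report"],
    ["rain", "snow"],
]

# Without "simple" each entry keeps the full tuple produced by find_top:
# (weighted score, fastText similarity, word, normalized BERT score).
full_response = [
    [[1.61, 0.61, "forecast", 1.0], [0.66, 0.34, "report", 0.32]],
    [[1.40, 0.55, "rain", 0.85], [0.91, 0.41, "snow", 0.50]],
]

# A client that only needs the words can reduce the full form itself:
words = [[entry[2] for entry in sentence] for sentence in full_response]
assert words == simple_response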