Google has introduced the Universal Sentence Encoder, which has become a very useful tool in the NLP domain. The main advantages of this embedding are that it understands context, is trained on a vast amount of data, and produces the same fixed-shape vector for words, sentences, and paragraphs, which makes it easy to search the vector space for similar embeddings.
import tensorflow as tf
import tensorflow_hub as hub
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Compare (major, minor) as integers; naive float parsing breaks on versions like 2.10
assert tuple(int(v) for v in tf.__version__.split('.')[:2]) >= (2, 2), "You need TensorFlow >= 2.2"
assert tuple(int(v) for v in hub.__version__.split('.')[:2]) >= (0, 8), "You need tensorflow-hub >= 0.8.0"
I omitted Google's universal encoder model from this repository due to size constraints; it can be downloaded from https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed
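If you want the local copy this post assumes, a minimal download-and-extract sketch is below (the archive served with ?tf-hub-format=compressed is a gzipped tar of a few hundred megabytes, so this is a one-time step). Alternatively, hub.load accepts the tfhub.dev URL directly and caches the model for you.

import os
import tarfile
import urllib.request

url = "https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed"
target = os.path.join(os.getcwd(), 'Google-Universal-Encoder-v4')
os.makedirs(target, exist_ok=True)
archive_path, _ = urllib.request.urlretrieve(url)  # download to a temp file
with tarfile.open(archive_path, 'r:gz') as tar:
    tar.extractall(target)                         # unpack into the model folder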
The SQuAD 2.0 training set can be downloaded from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
The training data is about 40 MB and covers a list of topics (~440 in total). One topic can have multiple contexts, and within each context there can be N question-and-answer pairs.
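For orientation, the nested layout of train-v2.0.json looks like this (a sketch with placeholder values; the keys are the real SQuAD 2.0 field names used by the flattening loop further down):

example = {
    "data": [                            # one entry per topic
        {
            "title": "<topic title>",
            "paragraphs": [              # one entry per context passage
                {
                    "context": "<context text>",
                    "qas": [             # N question/answer pairs per context
                        {
                            "question": "<question text>",
                            "is_impossible": False,
                            "answers": [{"text": "<answer text>", "answer_start": 0}]
                        }
                    ]
                }
            ]
        }
    ]
}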
+--
+-- Google-Universal-Encoder-v4
| +-- model files ...
+-- data
| +-- train-v2.0.json
location = os.path.join(os.getcwd(), 'Google-Universal-Encoder-v4')
model = hub.load(location)

def embed(texts):
    # Map a list of strings to a (len(texts), 512) tensor of embeddings
    return model(texts)
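To check the fixed-shape claim from the introduction, embed a word, a sentence, and a paragraph; each input comes back as a 512-dimensional vector regardless of length:

word = embed(["dog"])
sentence = embed(["The quick brown fox jumps over the lazy dog."])
paragraph = embed(["The Universal Sentence Encoder maps inputs of any length "
                   "into the same vector space, which is what makes words, "
                   "sentences and paragraphs directly comparable."])
print(word.shape, sentence.shape, paragraph.shape)  # (1, 512) each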
'''
Import the data into the notebook
'''
import json

new_path = os.path.join(os.getcwd(), 'data', 'train-v2.0.json')
with open(new_path) as f:
    training_data = json.load(f)
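A quick sanity check that the file loaded correctly; this should print the total number of topics (~440, as noted above):

print(len(training_data['data']))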
number_of_topics = 50
list_of_topics = [i['title'] for i in training_data['data'][0:number_of_topics]]
print("List of topics examined : ", ", ".join(list_of_topics))
dataset = []
for topic_area in training_data['data'][0:number_of_topics]:
    list_of_paragraphs = topic_area['paragraphs']
    for paragraph in list_of_paragraphs:
        context = paragraph['context']
        list_of_questions = paragraph['qas']
        for question in list_of_questions:
            list_of_answers = question['answers']
            # Get the first answer from the list of answers if an answer exists, else ""
            dataset.append({'topic_area': topic_area['title'],
                            'question': question['question'],
                            'no_answer': question['is_impossible'],
                            'context': context,
                            'answer': "" if len(list_of_answers) == 0 else list_of_answers[0]})
n_highest = 10
question_embeddings = embed([item['question'] for item in dataset])

def get_closest_matches(question="Who did Kanye produce Graduation with?"):
    assert isinstance(question, str), "Question must be a string of text"
    new_question_embedding = embed([question])
    # Cosine similarity between the new question and every pre-computed question embedding
    similarity = cosine_similarity(new_question_embedding, question_embeddings).flatten()
    top_results = similarity.argsort()[::-1][:n_highest]
    return top_results, similarity
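One design note: argsort sorts the whole similarity array even though only the top n_highest entries are needed. That is negligible at this corpus size, but for a much larger question bank np.argpartition does the same job without a full sort. A sketch of a drop-in replacement for the ranking step inside get_closest_matches:

# Partition so the n_highest largest scores sit at the end, then sort just those
top_unsorted = np.argpartition(similarity, -n_highest)[-n_highest:]
top_results = top_unsorted[np.argsort(similarity[top_unsorted])[::-1]]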
def print_results(top_results, similarity):
    for result in top_results:
        match = dataset[result]
        print("Question : {} with similarity of {}%".format(match['question'], round(similarity[result] * 100, 1)))
        print("No answer" if match['no_answer'] else "Answer : {}".format(match['answer']['text']))
        # Print a ~100-character window of context centered on the answer span
        print("No context" if match['no_answer'] else "Context : {}".format(
            match['context'][max(0, match['answer']['answer_start'] - 50):
                             min(len(match['context']), match['answer']['answer_start'] + 50)]))
        print("\n")
tr, sim = get_closest_matches("Who did Kanye produce Graduation with?")
print("Question : {}".format("Who did Kanye produce Graduation with?"))
print("-----------------------------------------------------------")
print("Top Results ..... \n ")
print_results(tr, sim)
Below are the results of retrieving the top 10 matching questions, answers, and context from a list of 2685 questions.
print(question_embeddings.shape)
%timeit get_closest_matches()