"""
This script has been adapted from the original script authored by Yunus Eryilmaz.
It has been modified to adapt the source and structure of the input and output
data for a specific use case: the data is passed in as a parameter, and the
results are returned as an array instead of being written to files.
"""
import argparse
import os
import pickle  # UP

import numpy as np
from sentence_transformers import models, SentenceTransformer

__author__ = "Yunus Eryilmaz"
__version__ = "1.0"
__date__ = "21.07.2021"
__sources__ = [
    "https://pypi.org/project/sentence-transformers/0.3.0/",
    "https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/Source/Skript/german/run_LR_SBERT.py",
]
__adapted_by__ = "Artem Baranovskyi"
__adaptation_date__ = "14.09.2024"
__adaptation_version__ = "1.0"
def process_data(data):
    """Grade student answers against a reference answer.

    ``data`` is expected to be a dict with the keys ``referenceAnswer`` (str)
    and ``studentAnswers`` (list of str); one result dict is returned per answer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        default="/app/asyst/Source/Skript/german/models",
        type=str,
        required=False,
        help="The directory where the ML models are stored.",
    )
    # parse_known_args() tolerates unrelated CLI arguments, so this function can
    # be called from a larger application without crashing on unknown flags
    args, _ = parser.parse_known_args()

    referenceAnswer = data['referenceAnswer']
    studentAnswers = data['studentAnswers']
    # Use a multilingual SBERT transformer to map tokens to embeddings
    word_embedding_model = models.Transformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # The pooling operation is selected via flags; mean pooling yields one
    # fixed-size sentence vector per input
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    # Compute the sentence embeddings for both sides of each pair; the default
    # numpy output is kept so the feature combination below can use numpy ops
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    sentence_embeddings1 = model.encode([referenceAnswer] * len(studentAnswers), show_progress_bar=False)
    sentence_embeddings2 = model.encode(studentAnswers, show_progress_bar=False)

    X_test = similarity(sentence_embeddings1, sentence_embeddings2)
    # UP: read the pre-trained logistic regression model from the model directory
    with open(os.path.join(args.model_dir, "clf_BERT.pickle"), "rb") as f:
        clf_log = pickle.load(f)
    predictions = clf_log.predict(X_test)

    results = []
    for prediction in predictions:
        results.append({
            "predicted_grade": "correct" if prediction == 1 else "incorrect"
        })
    return results
# Possible element-wise combinations of the embedded sentences can be selected here
def similarity(sentence_embeddings1, sentence_embeddings2):
    # I2 = (|u - v| + u * v), applied element-wise over the embedding batches
    simi = (np.abs(np.subtract(sentence_embeddings1, sentence_embeddings2))
            + np.multiply(sentence_embeddings1, sentence_embeddings2))
    return simi
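
# A minimal, self-contained usage sketch (not part of the original script).
# The reference and student answers below are invented sample data; the model
# files are assumed to be available at the default --model_dir path above.
if __name__ == "__main__":
    sample = {
        "referenceAnswer": "Photosynthesis converts light energy into chemical energy.",
        "studentAnswers": [
            "Plants turn sunlight into chemical energy.",
            "Photosynthesis is how fish breathe underwater.",
        ],
    }
    for answer, result in zip(sample["studentAnswers"], process_data(sample)):
        print(f"{result['predicted_grade']}: {answer}")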