# run_LR_SBERT.py
import os
import sys
import time
import numpy as np
import pandas as pd
# UP
import pickle
import argparse
from sklearn import metrics
from sentence_transformers import models, SentenceTransformer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_predict
"""
This script has been adapted from the original script authored by Yunus Eryilmaz.
It has been modified to change the source and structure of the input and output data
for a specific use case: the data is passed in as parameters, and the result is
returned as an array rather than as files.
"""
# Module metadata: details of the original script first, then of this adaptation.
__author__ = "Yunus Eryilmaz"
__version__ = "1.0"
__date__ = "21.07.2021"
__source__ = "https://pypi.org/project/sentence-transformers/0.3.0/"
# NOTE(review): the next line rebinds __source__, so the PyPI URL assigned just
# above is overwritten and only the GitLab URL is visible at runtime.
__source__ = "https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/Source/Skript/german/run_LR_SBERT.py"
__adapted_by__ = "Artem Baranovskyi"
__adaptation_date__ = "14.09.2024"
__adaptation_version__ = "1.0"
def process_data(data):
    """Grade student answers against a reference answer.

    The reference answer and every student answer are encoded with the
    multilingual SBERT model, each pair of embeddings is combined by
    ``similarity`` and the resulting features are classified by the
    pre-trained model stored as ``clf_BERT.pickle`` in the model directory.

    Args:
        data: Mapping with the keys ``'referenceAnswer'`` and
            ``'studentAnswers'``. ``'studentAnswers'`` must support ``len()``
            (presumably a list of strings, with ``'referenceAnswer'`` a single
            string -- confirm against the caller).

    Returns:
        A list with one dict per student answer, in input order, each of the
        form ``{"predicted_grade": "correct"}`` or
        ``{"predicted_grade": "incorrect"}``. An empty ``'studentAnswers'``
        yields an empty list.

    Raises:
        KeyError: If ``data`` lacks one of the two required keys.
        OSError: If the pickled classifier cannot be opened.
    """
    referenceAnswer = data['referenceAnswer']
    studentAnswers = data['studentAnswers']

    # Nothing to grade: skip the expensive model loading entirely.
    if len(studentAnswers) == 0:
        return []

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        default="/app/asyst/Source/Skript/german/models",
        type=str,
        required=False,
        help="The directory where the ML models are stored.",
    )
    # This function is called from a host application, so sys.argv may hold
    # options that belong to that application. parse_known_args() ignores them
    # instead of terminating the process the way parse_args() would.
    args, _unknown_args = parser.parse_known_args()

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.Transformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    # Apply mean pooling to get one fixed-size sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # The reference answer is repeated so both batches have one row per student answer.
    sentence_embeddings1 = model.encode([referenceAnswer] * len(studentAnswers), convert_to_tensor=True, show_progress_bar=False)
    sentence_embeddings2 = model.encode(studentAnswers, convert_to_tensor=True, show_progress_bar=False)
    X_test = similarity(sentence_embeddings1, sentence_embeddings2)

    # UP: read pre-trained LR model
    # NOTE: pickle can execute arbitrary code while loading; the model
    # directory must only ever contain trusted files.
    classifier_path = os.path.join(args.model_dir, "clf_BERT.pickle")
    with open(classifier_path, "rb") as classifier_file:
        clf_log = pickle.load(classifier_file)
    predictions = clf_log.predict(X_test)

    return [
        {"predicted_grade": "correct" if prediction == 1 else "incorrect"}
        for prediction in predictions
    ]


# Possible concatenations from the embedded sentences can be selected
def similarity(sentence_embeddings1, sentence_embeddings2):
    """Combine two embedding batches element-wise as ``|u - v| + u * v``.

    Args:
        sentence_embeddings1: First operand ``u``.
        sentence_embeddings2: Second operand ``v``; must be compatible with
            ``u`` for element-wise ``np.subtract`` / ``np.multiply``.

    Returns:
        The element-wise combination of the two inputs.
    """
    # I2=(|u − v| + u ∗ v)
    simi = abs(np.subtract(sentence_embeddings1, sentence_embeddings2)) + np.multiply(sentence_embeddings1, sentence_embeddings2)
    return simi