run_LR_SBERT.py
import os
import sys
import time

import numpy as np
import pandas as pd

# UP
import pickle
import argparse

from sentence_transformers import models, SentenceTransformer

__author__ = "Yunus Eryilmaz"
__version__ = "1.0"
__date__ = "21.07.2021"
__source__ = "https://pypi.org/project/sentence-transformers/0.3.0/"



def main():
    parser = argparse.ArgumentParser()

    # Where are we?
    location = ".";
    if getattr(sys, 'frozen', False):
        # running in a bundle
        location = sys._MEIPASS

    # Required parameters
    parser.add_argument(
        "--data",
        default="/var/www/html/moodle/asyst/Source/Skript/outputs/test.tsv",
        type=str,
        required=False,
        help="The input data file for the task.",
    )
    parser.add_argument(
        "--output_dir",
        default="/var/www/html/moodle/asyst/Source/Skript/outputs",
        type=str,
        required=False,
        help="The output directory where predictions will be written.",
    )
    parser.add_argument(
        "--model_dir",
        default=location+"/Skript/german/models",
        type=str,
        required=False,
        help="The directory where the ML models are stored.",
    )
    parser.add_argument(
        "--transformer_model_dir",
        default="/var/www/html/moodle/sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2",
        type=str,
        required=False,
        help="The directory where the SentenceTransformer model is stored.",
    )
    args = parser.parse_args()

    # open a log file next to the executable with line buffering
    # out = open("log.txt", "a",buffering=1);

    # print("Started German processing in", location, file=out);

    # import SentenceTransformer-model
    start_time = time.time()

    # print("Reading from", args.data, file=out);

    with open(args.data) as ft:
        dft = pd.read_csv(ft, delimiter='\t')
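    # The input is expected to be tab-separated with (at least) four columns,
    # used below as: question, referenceAnswer, studentAnswer, label
    # (the observed grade). This is inferred from the column accesses in this
    # script rather than from a formal schema.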

    # Sentences we want sentence embeddings for
    sentences1_test = dft['referenceAnswer'].values.tolist()
    sentences2_test = dft['studentAnswer'].values.tolist()
    # print("Input read:", sentences2_test, file=out);

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.Transformer(args.transformer_model_dir)

    # The pooling mode is chosen via the flags below; mean pooling is applied
    # here to obtain one fixed-size sentence vector per input.
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
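    # (Mean pooling averages the token embeddings: for example, token vectors
    # [1, 3] and [3, 5] average to the sentence vector [2, 4].)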

    # compute the sentence embeddings for both sentences
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # print("Model loaded", file=out);

    sentence_embeddings1_test = model.encode(sentences1_test, convert_to_tensor=True, show_progress_bar=False)
    # print("Embeddings RefA:", sentence_embeddings1_test, file=out);

    sentence_embeddings2_test = model.encode(sentences2_test, convert_to_tensor=True, show_progress_bar=False)
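    # Each encode() call returns one embedding per input sentence; for
    # paraphrase-multilingual-MiniLM-L12-v2 every embedding has 384 dimensions,
    # so each result has shape (number of answers, 384).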
    # print("Embeddings found", file=out);

    # Possible ways of combining the two sentence embeddings can be selected;
    # the I2 variant is used here
    def similarity(sentence_embeddings1, sentence_embeddings2):
        # I2 = (|u - v| + u * v), element-wise
        simi = (abs(np.subtract(sentence_embeddings1, sentence_embeddings2))
                + np.multiply(sentence_embeddings1, sentence_embeddings2))
        return simi

    # call the similarity function to obtain the combined feature vector for each pair of sentence embeddings
    computed_simis_test = similarity(sentence_embeddings1_test, sentence_embeddings2_test)
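    # Worked example (illustrative values only): for u = [1, 2] and v = [3, 4],
    # |u - v| + u * v = [2, 2] + [3, 8] = [5, 10]; the pre-trained classifier
    # below expects feature vectors of this form.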

    # assemble the feature matrix for the test set (labels are not needed at prediction time)

    X_test = computed_simis_test
    # Y_test = np.array(dft['label'])

    # UP: read the pre-trained LR model (a scikit-learn LogisticRegression
    # serialized with pickle) from the configured model directory
    with open(os.path.join(args.model_dir, "clf_BERT.pickle"), "rb") as f:
        clf_log = pickle.load(f)
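    # Note: unpickling assumes a scikit-learn version compatible with the one
    # used to train and serialize the classifier.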

    # print('--------Evaluate on Testset------- ', file=out)
    predictions = clf_log.predict(X_test)

    # UP print results
    with open(args.output_dir + "/predictions.txt", "w") as writer:
        # TODO: write results to plugins DB Table
        writer.write("question\treferenceAnswer\tstudentAnswer\tsuggested grade\tobserved grade\n")
        for i in range(len(dft)):
            hrpred = "correct" if predictions[i] == 1 else "incorrect"
            writer.write(
                "\t".join(
                    (
                        str(dft.iloc[i, 0]),  # question
                        str(dft.iloc[i, 1]),  # referenceAnswer
                        str(dft.iloc[i, 2]),  # studentAnswer
                        hrpred,               # suggested grade
                        str(dft.iloc[i, 3]),  # observed grade
                    )
                )
                + "\n"
            )

    # print('\nExecution time:', time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)), file=out)


if __name__ == "__main__":
    main()
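
# Example invocation (all paths are illustrative and depend on the deployment):
#   python run_LR_SBERT.py \
#       --data outputs/test.tsv \
#       --output_dir outputs \
#       --model_dir Skript/german/models \
#       --transformer_model_dir /path/to/paraphrase-multilingual-MiniLM-L12-v2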