Commit 50c5a807 authored by Siddharth Thorat's avatar Siddharth Thorat

final changes commit

Showing with 1324 additions and 0 deletions
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (Asyst_code)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.8" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (Asyst_code)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Asyst_code.iml" filepath="$PROJECT_DIR$/.idea/Asyst_code.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
ASYST.exe 0 → 100644
File added
File added
ASYST ist ein Werkzeug zur Unterstützung beim Bewerten von Freitextantworten. <a href="README_DE.md">Mehr Information</a>
ASYST is a tool to support grading of free text answers. <a href="README_EN.md">More information</a>
<a href="https://www.hft-stuttgart.com/research/projects/current/knight-project"><img src="images/csm_Knight_Logo_eac7003904.webp" alt="KNIGHT-Logo" width="200"/></a> <img src="images/csm_BMBF_gefoerdert_vom_deutsch_48a18b1887.webp" alt="BMBF-Logo" width="200"/>
<h2>Inhalt:</h2>
<ul>
<li><a href="#1"> Was ist ASYST? </a> </li>
<li><a href="#2"> Welche Sprachen unterstützt ASYST? </a></li>
<li><a href="#3"> Wie verwendet man ASYST? </a></li>
<ol>
<li><a href="#4"> Wie müssen auszuwertende Daten formatiert sein?</a> </li>
<li><a href="#5"> Wie führe ich das Programm unter Windows 11 aus? </a> </li>
</ol>
<li><a href="#6"> Wie arbeit man mit der Ausgabe von ASYST weiter? </a></li>
<li><a href="#7"> Wie kann ich ASYST ausführen, wenn ich kein Windows 11 nutze? </a>
<ul><li><a href="#8"> Ausführen von ASYST in der Entwicklungsumgebung Pycharm</a> </li></ul></li>
</ul>
<h2 id=1>Was ist ASYST?</h2>
ASYST ist ein Programm, das Lehrenden die Auswertung von Freitextantworten in Tests erleichtern soll: Mit Hilfe künstlicher Intelligenz
macht ASYST Bewertungsvorschläge, die von den Lehrenden gezielt überprüft und ggf. verändert werden können.
ASYST ist für die Bewertung von Freitext-Kurzantworten gedacht - diese Kurzantworten sollten etwa ein bis drei Sätze umfassen. Für längere Antworten ist die Anwendung nicht vorgesehen.
ASYST hilft der Lehrperson, indem es eine Bewertung vorschlägt. Diese Bewertung kann im Einzelfall durchaus auch falsch sein; die Lehrperson kann sie prüfen und korrigieren.
Dabei spart man gegenüber der völlig manuellen Bewertung an zwei Stellen Zeit: Zum einen ist das Überprüfen von Bewertungen im Allgemeinen schneller als das Bewerten von Grund auf;
und zum anderen empfehlen wir, bei der Überprüfung gezielt die ASYST-Bewertungen auszuwählen, die eher fehleranfällig sind (s. Abschnitt <a href="#6"> Wie arbeitet man mit der Ausgabe von ASYST weiter? </a>).
Das Programm ist in Python geschrieben; der Quellcode ist öffentlich zugänglich. Um ASYST einfacher nutzbar zu machen, wurden die Python-Skripte
in eine ausführbare Programmdatei umgewandelt, die in Windows 11 nutzbar ist.
<h2 id=2>Welche Sprachen unterstützt ASYST?</h2>
ASYST wurde für Deutsch und <a href="https://nlpado.de/~ulrike/papers/Pado22.pdf">Englisch</a> getestet.
Das Sprachmodell, das Deutsch abdeckt, kann im Prinzip noch weitere Sprachen verarbeiten. Sie können also grundsätzlich "Deutsch" als Spracheinstellung auswählen und Daten in einer der anderen unterstützten Sprachen hochladen. Bitte prüfen Sie die Ergebnisse aber sorgfältig, es liegen keine Erfahrungen vor! (Die Sprachen sind <a href="https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models">lt. den Modellerstellern</a>: ar, bg, ca, cs, da, de, el, en, es, et, fa, fi, fr, fr-ca, gl, gu, he, hi, hr, hu, hy, id, it, ja, ka, ko, ku, lt, lv, mk, mn, mr, ms, my, nb, nl, pl, pt, pt-br, ro, ru, sk, sl, sq, sr, sv, th, tr, uk, ur, vi, zh-cn, zh-tw.)
<h2 id=3>Wie verwendet man ASYST?</h2>
Wir haben bei der Entwicklung von ASYST versucht, die Verwendung möglichst einfach zu machen.
<h3 id=4>Wie müssen auszuwertende Daten formatiert sein?</h3>
Das Programm arbeitet auf Basis Ihrer Daten im Excel-Format .xlsx (das auch von Libre Office Calc und anderen Programmen erzeugt werden kann). Eine Beispieltabelle:
![table_input.png](images%2Ftable_input.png)
Dabei müssen die folgenden Informationen in der **richtigen Reihenfolge** und mit den **richtigen Spaltentiteln** enthalten sein:
1) **Question**: Die gestellte Frage
2) **referenceAnswer**: Eine korrekte Antwort / Musterlösung / Referenzantwort
3) **studentAnswer**: Die vom Prüfling gegebene Antwort, die bewertet werden soll.
4) (optional) **observed grade**: Hier kann die tatsächliche Bewertung durch die Lehrkraft eingetragen werden, um Kennzahlen über die Richtigkeit der Vorhersagen zu bekommen.
Die Beispieltabelle finden Sie unter <a href="https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/DE_Demo_Daten.xlsx">DE_Demo_Daten.xlsx</a>. Sie enthält einige Fragen und Antworten aus dem CSSAG-Korpus (Computer Science Short Answers in German) der HFT Stuttgart. Das Korpus ist CC-BY-NC-lizenziert.
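Zur Veranschaulichung eine minimale Skizze (kein Bestandteil von ASYST), wie sich eine solche Eingabetabelle z.B. mit pandas erzeugen lässt. Dateiname und Beispielinhalte sind frei gewählt; die genaue Schreibweise der Spaltentitel entnehmen Sie bitte der Beispieltabelle DE_Demo_Daten.xlsx. Vorausgesetzt wird, dass pandas und openpyxl installiert sind.

```python
# Minimale Skizze (Annahme: pandas und openpyxl sind installiert).
# Erzeugt eine Eingabetabelle mit den oben beschriebenen Spalten;
# Dateiname und Inhalte sind frei gewählte Beispiele.
import pandas as pd

daten = pd.DataFrame({
    "question": ["Was ist ein Compiler?"],
    "referenceAnswer": ["Ein Programm, das Quellcode in Maschinencode übersetzt."],
    "studentAnswer": ["Er übersetzt Programme in ausführbaren Code."],
    "observed grade": ["correct"],  # optional; nur für die spätere Auswertung nötig
})

daten.to_excel("meine_testdaten.xlsx", index=False)
```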
<h3 id=5>Wie führe ich das Programm unter Windows 11 aus? </h3>
Zunächst muss die Datei _ASYST.exe_ <a href="https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/ASYST.exe"> heruntergeladen werden</a>.
Sobald dies geschehen ist, kann das Programm mittels Doppelklick gestartet werden.
Der Start des Programmes wird eine Weile dauern (ca. 1 Minute). In dieser Zeit wird das System initialisiert.
**Hinweis**: Es kann passieren, dass Windows Defender davor warnt, die Anwendung auszuführen, da das Programm kein Sicherheitszertifikat besitzt.
Durch Auswählen von _weitere Informationen_ und anschließend _Trotzdem ausführen_ verschwindet die Fehlermeldung und ASYST kann ausgeführt werden. Der Quelltext von ASYST ist offen zugänglich, so dass Sie sich vergewissern können, dass ASYST keine Schadsoftware ist.
<img src="images/win_def_de_1.JPG" width="450">
<img src="images/win_def_de_2.JPG" width="450">
Nachdem das Programm gestartet wurde, erscheint eine Oberfläche, auf der die Sprache der auszuwertenden Antworten ausgewählt werden kann.
Anschließend kann über einen Klick auf das Feld "Input File" die zu verarbeitende Tabelle ausgewählt werden.
Hierbei sollten die Daten wie oben beschrieben angeordnet sein.
Nach einem Klick auf das "Start"-Feld beginnt ASYST mit der Verarbeitung der Daten. Dies kann wiederum eine Weile dauern (1-2 Minuten, relativ unabhängig von der Menge der zu verarbeitenden Daten).
Sobald das Programm alle Einträge verarbeitet und Vorhersagen getroffen hat, öffnet sich eine Tabellenansicht mit der Überschrift "Results" (Ergebnisse).
Die Ergebnistabelle enthält alle Spalten der eingelesenen Tabelle, sowie zusätzlich in der Spalte "predicted grade" die von ASYST vorgeschlagene Bewertung der Antworten. Die "incorrect"-Einträge der als falsch eingestuften Antworten sind rot hinterlegt. Sie können in dieser Tabelle allerdings noch keine Bewertungen verändern. Speichern Sie hierzu über einen Klick auf "Save as" die erzeugte Tabelle und öffnen Sie sie dann mit einem Tabellenkalkulationsprogramm.
![table_results.png](images%2Ftable_results.png)
Sobald die Ergebnistabelle angezeigt wird, kann ASYST die nächste Tabelle einlesen und verarbeiten.
**ACHTUNG: Die Ergebnistabelle wird nicht automatisch gespeichert.** Werden die Ergebnisse nicht gespeichert,
wird die Ergebnistabelle im nächsten Durchlauf überschrieben.
Daher sollte, um die Ergebnisse zu sichern, auf den **"Save as"**- Button geklickt und die Ausgabetabelle am gewünschten Ort gespeichert werden.
<h2 id=6>Wie arbeitet man mit der Ausgabe von ASYST weiter?</h2>
Wir empfehlen die folgende **Vorgehensweise** beim Einsatz von ASYST:
(Weitere Informationen und ein konkretes Beispiel für das Vorgehen liefert der Artikel <a href="https://nlpado.de/~ulrike/papers/Pado22.pdf">_Assessing the Practical Benefit of Automated Short-Answer Graders_</a>.)
1) **Definition der Anforderungen**: Wie genau muss die Bewertung in meinem aktuellen Anwendungsfall sein?
<ul>
<li>Bei der Bewertung von Freitextfragen in eher informellen Testsituationen (keine Abschlussklausur o.ä.) unterscheiden sich auch <b>menschliche Bewertungen</b> in ca. 15% der Fälle - 0% Abweichung sind also auch für Menschen kaum erreichbar! </li>
<li>Wir empfehlen daher in solchen Situationen, eine Bewertungsgenauigkeit von mindestens 85% auch nach dem Einsatz von ASYST plus der menschlichen Korrektur anzustreben. </li>
<li>Zu beachten ist zudem die Verteilung der Bewertungsfehler (übermäßige Strenge/Milde).</li>
<li>Letztlich sollte die Verwendung des Tools den Anwender:innen eine Zeitersparnis bringen: Setzen Sie das verfügbare Budget oder eine angestrebte Mindestersparnis fest. </li>
</ul>
2) **Sammeln von** manuell bewerteten **Testdaten:**
Um einen Eindruck von der Genauigkeit und Zuverlässigkeit des automatischen Bewerters zu bekommen, werden annotierte Testdaten benötigt,
d.h. Eingabe-Daten, für die eine korrekte Klassifizierung bereits festgelegt ist. Es werden also Daten im einlesbaren Format benötigt, die bereits manuell bewertet wurden. Dies können z.B. Antworten aus früheren Tests sein.
Um den Datensatz möglichst robust gegenüber zufälligen Schwankungen zu machen, sollte er idealerweise einige hundert Antworten umfassen -- aber kleinere Datensätze können natürlich ebenfalls verwendet werden.
3) **Analyse** der Leistung der automatischen Bewertung
Anhand der manuell bewerteten Testdaten kann nun gemessen werden, wie zuverlässig und treffsicher der Klassifizierer für die spezifischen Fragen arbeitet. Damit bekommen Sie einen Eindruck davon, wie gut die Vorhersage für Ihren eigenen Datensatz funktioniert.
Hierzu werden die Fragen und Antworten aus dem Testdatensatz von ASYST verarbeitet und anschließend die erhaltene Klassifikation mit der manuellen Bewertung abgeglichen (z.B. in einer Tabellenkalkulation wie Excel oder Libre Office Calc).
Dabei kann der Anteil der korrekt klassifizierten Antworten im gesamten Datensatz ermittelt werden - dieser sollte 85% oder höher betragen (das entspricht einer Fehlerquote von 15% oder weniger).
Sie können außerdem für die einzelnen Bewertungsklassen (richtig/falsch) berechnen, wie groß die Präzision jeweils ist. Die Präzision misst, wie viele Vorhersagen einer bestimmten Bewertung tatsächlich richtig waren, d.h. wie vertrauenswürdig die Vorhersagen des Bewerters für ein bestimmtes Label sind. So bedeutet eine Präzision von 75% für die Bewertung "korrekt", dass drei Viertel aller Vorhersagen von "korrekt" gestimmt haben, in einem Viertel der Fälle die Antwort laut der manuellen Bewertung aber falsch war. (Eine Code-Skizze zur Berechnung dieser Kennzahlen folgt im Anschluss an diese Liste.)
_(Die Funktion, diese Kenngrößen der Zuverlässigkeit automatisch in einem Testmodus zu generieren, soll dem Programm in Zukunft noch hinzugefügt werden.)_
4) **Entscheidung** wie der Ansatz genutzt werden soll.
Anhand der erhobenen Kenngrößen zur Zuverlässigkeit für die oben genannten Kriterien kann nun eine Entscheidung getroffen werden.
<ul>
<li> Wie groß ist der Anteil der korrekt vorhergesagten Bewertungen? Beträgt er >85%, können Sie die ASYST-Vorhersagen sogar unverändert übernehmen, falls Sie dies wünschen. </li>
<li> Wie ist die Präzision der einzelnen Bewertungsklassen (richtig/falsch)? Wenn eine der Klassen deutlich zuverlässiger vorhergesagt wird, können Sie entscheiden, diese Vorhersagen ungeprüft zu übernehmen und <b>nur</b> die Vorhersagen für die weniger verlässlich erkannte Klasse zu überprüfen. Dies führt in der Praxis zu einer deutlichen Zeitersparnis. </li>
<li>Wie ist der Bewertungsfehler verteilt? Werden übermäßig viele korrekte Antworten als falsch bewertet, oder umgekehrt? Ist dies für Ihre Situation akzeptabel? </li>
<li> Wie viel Bewertungsaufwand hätten Sie für den Beispieldatensatz eingespart, z.B. indem Sie die verlässlichere Bewertungsklasse ungeprüft akzeptieren? </li>
</ul>
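Zur Veranschaulichung eine minimale Skizze (kein Bestandteil von ASYST), wie sich Genauigkeit sowie Präzision je Bewertungsklasse z.B. mit pandas und scikit-learn berechnen lassen. Angenommen wird, dass die gespeicherte Ergebnistabelle "ergebnisse.xlsx" heißt und die Spalten "observed grade" (manuelle Bewertung) und "predicted grade" (ASYST-Vorschlag) enthält; passen Sie Datei- und Spaltennamen ggf. an Ihre Daten an.

```python
# Minimale Skizze: Genauigkeit und Präzision je Bewertungsklasse bestimmen.
# Annahmen: pandas und scikit-learn sind installiert; die gespeicherte
# Ergebnistabelle heißt "ergebnisse.xlsx" und enthält die Spalten
# "observed grade" (manuell) und "predicted grade" (ASYST-Vorschlag).
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_excel("ergebnisse.xlsx")
manuell = df["observed grade"]
vorschlag = df["predicted grade"]

# Anteil korrekt klassifizierter Antworten (Ziel: ca. 85% oder mehr)
print("Genauigkeit:", round(accuracy_score(manuell, vorschlag), 3))

# Präzision, Recall und F1 je Bewertungsklasse ("correct"/"incorrect")
print(classification_report(manuell, vorschlag, zero_division=0))
```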
<h3 id=7>Wie kann ich ASYST ausführen, wenn ich kein Windows 11 nutze?</h3>
Die klickbare Anwendung "ASYST.exe" eignet sich nur für die Ausführung unter Windows 11.
In anderen Betriebssystemen kann ASYST aus einer Entwicklungsumgebung heraus ausgeführt werden.
Der ASYST-Quellcode ist ursprünglich in Python geschrieben und kann daher robust in verschiedenen Umgebungen ausgeführt werden.
Für Anwender, die mit dem Ausführen von Python-Programmen nicht vertraut sind, wird im Folgenden eine Möglichkeit näher beschrieben.
<h4 id=8>Ausführen von ASYST in der Entwicklungsumgebung Pycharm </h4>
<ol>
<li>Falls noch nicht geschehen, die Entwicklungsumgebung Pycharm aus dem Internet
<a href="https://www.jetbrains.com/pycharm/download/?section=mac"> herunterladen </a> und installieren.
Für mehr Informationen und Problemlösung siehe
<a href="https://www.jetbrains.com/help/pycharm/installation-guide.html"> Pycharm-Installationsguide</a>.</li>
<li>Python installieren
Die Entwicklung von ASYST erfolgte in Python 3.10 - daher wird diese Version für die Ausführung empfohlen.
Die zum Betriebssystem passende Version kann unter https://www.python.org/downloads ausgewählt und installiert werden.
</li>
<li> Den Quellcode aus Gitlab in die Entwicklungsumgebung herunterladen:
Get from VCS
<img src="images/get_from_vcs.png" width="450">
im Feld _url_ folgenden Pfad eintragen: git@transfer.hft-stuttgart.de:ulrike.pado/ASYST.git
<img src="images/svn_url.png" width="450">
Anschließend auf _clone_ klicken und warten
</li>
<li>Entwicklungsumgebung konfigurieren
**Python-Interpreter konfigurieren:**
Navigiere zu _Settings >> Project ASYST >> Python Interpreter >> Add Interpreter >> Add local Interpreter_
![add_interpreter.png](images%2Fadd_interpreter.png)
![create_venv.png](images%2Fcreate_venv.png)
_Location_: [Projektpfad]/[Projektname]/Source,
_Base interpreter_: Pfad zur installierten Pythonversion
*Benötigte Pakete installieren:*
Falls Pycharm nicht von sich aus vorschlägt, die in der requirements.txt aufgeführten Pakete zu installieren,
führe manuell über das Terminal von PyCharm folgende Befehle aus:
```
cd Source
pip install -r requirements.txt
```
</li>
<li>ASYST ausführen
![run_button.png](images%2Frun_button.png)
Nachdem über das Projektverzeichnis links die Datei _main.py_ ausgewählt wurde, wird der ausgegraute _Startknopf_ oben rechts
im Fenster grün. Ein einfacher Klick genügt, und ASYST wird ausgeführt.
</li>
</ol>
<h2>Content:</h2>
<ul>
<li><a href="#1"> What is ASYST? </a> </li>
<li><a href="#2"> Which languages are supported by ASYST? </a></li>
<li><a href="#3"> How do I use ASYST? </a></li>
<ol>
<li><a href="#4"> What does the input look like?</a> </li>
<li><a href="#5"> How do I run ASYST on Windows 11? </a></li>
</ol>
<li><a href="#6"> How do I continue with the output from ASYST? </a></li>
<li><a href="#7"> How do I run ASYST if I don't use Windows 11? </a>
<ul><li><a href="#8"> Running ASYST in the Pycharm development environment</a> </li></ul></li>
</ul>
<h2 id=1>What is ASYST?</h2>
ASYST is a program designed to support teachers as they grade free-text answers in tests: With the help of Artificial Intelligence,
ASYST makes grade suggestions that can be reviewed and, if necessary, modified by the teachers.
ASYST is intended for the evaluation of short answers that are one to three sentences long. It is not intended to be used for longer responses.
ASYST helps the teacher by suggesting a grade. This assessment may well be incorrect in individual cases; the teacher can check and correct it.
This saves time in two ways compared to completely manual grading: First, reviewing grades is generally faster than grading from scratch;
and second, we recommend reviewing mostly those ASYST grades that are most prone to errors (see Section <a href="#6"> How do I continue with the output from ASYST? </a>).
The program is written in Python; the source code is publicly available. To make ASYST easier to use, the Python scripts have been
converted into an executable that is usable in Windows 11.
<h2 id=2>Which languages are supported by ASYST?</h2>
ASYST has been tested for German and <a href="https://nlpado.de/~ulrike/papers/Pado22.pdf">English</a>.
The language model that covers German can in principle handle other languages, as well. So, in principle, you could select "German" as language setting and upload data in one of the other languages covered by the model. If you try this, please check the results carefully, as this is untested! (<a href="https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models">According to the model developers,</a> the covered languages are: ar, bg, ca, cs, da, de, el, en, es, et, fa, fi, fr, fr-ca, gl, gu, he, hi, hr, hu, hy, id, it, ja, ka, ko, ku, lt, lv, mk, mn, mr, ms, my, nb, nl, pl, pt, pt-br, ro, ru, sk, sl, sq, sr, sv, th, tr, uk, ur, vi, zh-cn, zh-tw.)
<h2 id=3>How do I use ASYST?</h2>
We developed ASYST to be as user-friendly as possible.
<h3 id=4>What does the input look like?</h3>
The program works based on your data in Excel's .xlsx format (which can also be generated by Libre Office Calc and other programs). This is an example table:
![table_input.png](images%2Ftable_input.png)
The following information needs to be included in the **correct order** and with the **correct column headings**:
1) **question**: The question that was asked
2) **referenceAnswer**: A correct answer / reference answer
3) **studentAnswer**: The student answer that is to be evaluated
4) (optional) **observed grade**: The grade given by the teacher can be entered here in order to evaluate the accuracy of the ASYST predictions.
The example table can be found at <a href="https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/DE_Demo_Daten.xlsx">DE_Demo_Daten.xlsx</a>. It contains some questions and answers from the CSSAG corpus (Computer Science Short Answers in German) of HFT Stuttgart. The corpus is licensed as CC-BY-NC.
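As an illustration only (not part of ASYST itself), here is a minimal sketch of how you could check such a file before loading it. It assumes pandas and openpyxl are installed; the file name "my_answers.xlsx" is made up, and the exact column headings should be compared against the DE_Demo_Daten.xlsx example table.

```python
# Minimal sketch: check that an input file has the expected column headings.
# Assumptions: pandas/openpyxl installed; "my_answers.xlsx" is your own file;
# verify the exact headings against the DE_Demo_Daten.xlsx example table.
import pandas as pd

REQUIRED = ["question", "referenceAnswer", "studentAnswer"]  # "observed grade" is optional

df = pd.read_excel("my_answers.xlsx")
missing = [col for col in REQUIRED if col not in df.columns]
if missing:
    print("Missing columns:", missing)
else:
    print(f"OK: {len(df)} student answers ready for ASYST.")
```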
<h3 id=5>How do I run ASYST on Windows 11? </h3>
First, download <a href="https://transfer.hft-stuttgart.de/gitlab/ulrike.pado/ASYST/-/blob/main/ASYST.exe">_ASYST.exe_ </a>.
The program can be started by double-clicking its icon.
The program will take a while to start (approx. 1 minute). During this time the system is initialized.
**Note**: Windows Defender may warn against running the application because the program does not have a security certificate.
By selecting _more information_ and then _Run anyway_, the warning disappears and ASYST can be executed. ASYST's source code is openly available, so you can verify that the code is not malicious.
<img src="images/win_def_de_1.JPG" width="450">
<img src="images/win_def_de_2.JPG" width="450">
After the program has been started, a window appears. First, select the language of the answers to be evaluated.
The table to be processed can then be selected by clicking on the “Input File” field. The data should be arranged as described above.
After clicking on the “Start” field, ASYST begins processing the data. Again, this can take a while (1-2 minutes, relatively independent of the amount of data being processed).
Once the program has processed all answers and made predictions, a table view headed "Results" opens.
The results table contains all columns of the input table, plus the grades suggested by ASYST in the "predicted grade" column. The grades of answers classified as incorrect are highlighted in red. You cannot make changes in this table yet. Instead, save the data by clicking on “Save as” and then open the resulting .xlsx file with a spreadsheet program.
![table_results.png](images%2Ftable_results.png)
As soon as the result table is displayed, ASYST can read and process the next input table.
**ATTENTION: The results table is not saved automatically.** If the results are not saved, they will be overwritten in the next run.
Therefore, to save the results, the **"Save as"** button should be clicked and the output table should be saved at the desired location.
<h2 id=6>How do I continue with the output from ASYST?</h2>
We recommend the following **process** when using ASYST:
(Further information and a concrete example of the procedure can be found in <a href="https://nlpado.de/~ulrike/papers/Pado22.pdf">_Assessing the Practical Benefit of Automated Short-Answer Graders_</a>.)
1) **Define requirements**: How accurate does the grading need to be in my current use case?
<ul>
<li>When evaluating free text questions in low-stakes test situations (not in a final exam or similar), <b>human grades</b> differ in around 15% of cases - so 0% disagreement is hardly achievable even for humans! </li>
<li>In such situations, we therefore recommend aiming for a grading accuracy of at least 85% after using ASYST plus human review. </li>
<li>The distribution of grading errors (excessive strictness/leniency) should also be taken into account</li>
<li>Ultimately, using the tool should save users time: set the available time budget or a minimum requirement for time saved. </li>
</ul>
2) **Collect** manually evaluated **test data:**
To get an idea of the accuracy and reliability of the automated grader, annotated test data is needed.
This is input data for which a grade has already been determined. This can be answers from previous tests, for example.
To make the data set as robust as possible against random fluctuations, it should ideally contain a few hundred responses -- but smaller data sets can of course also be used.
3) **Analyze** the performance of the automated grading
The manually graded test data can be used to measure how reliable and accurate the automated grader is for the test data. This will give you an idea of how well the grade prediction works for your own data set.
For this purpose, process the questions and answers from the test data set using ASYST and compare the grade predictions with the manual assessment (e.g. in a spreadsheet such as Excel or Libre Office Calc).
The proportion of correctly classified answers in the entire data set gives you the system accuracy, which should be around 85% or higher (i.e., disagreement between the manual and machine grades of 15% or less).
You can also calculate the precision for each grade label ("correct"/"incorrect"). Precision measures how many predictions of a given grade were actually correct, i.e. how trustworthy ASYST's predictions are for a given label. A precision of 75% for the grade "correct", for example, means that three quarters of all predictions of "correct" were in fact right, but that in a quarter of the cases the answer was actually wrong according to the manual grades. (A sketch for computing these figures follows after this list.)
_(We plan to add functionality to automatically generate these reliability parameters in the future.)_
4) **Decide** how to use ASYST's predictions.
A usage decision can now be made based on the reliability parameters collected for the criteria mentioned above.
<ul>
<li> How large is the proportion of correctly predicted grades? If it is >85%, you can even adopt the ASYST predictions unchanged if you wish. </li>
<li> What is the precision of the grade labels ("correct"/"incorrect")? If one of the grade labels is predicted significantly more reliably, you can decide to accept those predictions without review and <b>only</b> check the predictions for the less reliable label. In practice, this leads to significant time savings. </li>
<li>How is the grading error distributed? Are correct answers frequently predicted to be incorrect, or vice versa? Is this acceptable for your situation? </li>
<li> How much evaluation effort would you have saved for the example data set, e.g. by accepting the more reliable grade label without review? </li>
</ul>
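To make the analysis and decision steps above concrete, here is a minimal sketch (not part of ASYST) that computes accuracy, per-label precision, and the share of answers you could skip reviewing. It assumes pandas and scikit-learn are installed and that you saved ASYST's output as "results.xlsx" with the columns "observed grade" (manual) and "predicted grade" (ASYST); adjust file and column names to your data.

```python
# Minimal sketch: accuracy, per-label precision, and potential review savings.
# Assumptions: pandas and scikit-learn are installed; ASYST's output was saved
# as "results.xlsx" with columns "observed grade" (manual) and "predicted grade".
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score

df = pd.read_excel("results.xlsx")
manual = df["observed grade"]
predicted = df["predicted grade"]

# Overall agreement between ASYST and the manual grades (target: roughly 85%+).
print("Accuracy:", round(accuracy_score(manual, predicted), 3))

# Precision per grade label, plus how many answers you would skip reviewing
# if you accepted that label's predictions unchanged.
for label in ["correct", "incorrect"]:
    prec = precision_score(manual, predicted, labels=[label],
                           average=None, zero_division=0)[0]
    share = (predicted == label).mean()
    print(f"Precision for '{label}': {prec:.2f} "
          f"(accepting it unreviewed skips {share:.0%} of answers)")
```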
<h3 id=7>How can I run ASYST if I don't use Windows 11?</h3>
The clickable application “ASYST.exe” is only suitable for running on Windows 11.
On other operating systems, ASYST can be run from a development environment.
The ASYST source code is written in Python and can therefore be robustly executed in various development environments.
For users who are not familiar with running Python programs, one option is described in more detail below.
<h4 id=8>Running ASYST in the Pycharm development environment </h4>
<ol>
<li>If you haven't already done so, <a href="https://www.jetbrains.com/pycharm/download/?section=mac">download</a> and install the Pycharm development environment.
For more information and troubleshooting see
<a href="https://www.jetbrains.com/help/pycharm/installation-guide.html">Pycharm installation guide</a>.</li>
<li>Install Python
ASYST was developed in Python 3.10 - therefore this version is recommended for execution.
The version that matches the operating system can be selected and installed at https://www.python.org/downloads.
</li>
<li> Download the source code from Gitlab to the development environment:
Get from VCS
<img src="images/get_from_vcs.png" width="450">
Enter the following path in the _url_ field: git@transfer.hft-stuttgart.de:ulrike.pado/ASYST.git
<img src="images/svn_url.png" width="450">
Then click on _clone_ and wait
</li>
<li>Configure development environment
**Configure Python interpreter:**
Navigate to _Settings >> Project ASYST >> Python Interpreter >> Add Interpreter >> Add local Interpreter_
![add_interpreter.png](images%2Fadd_interpreter.png)
![create_venv.png](images%2Fcreate_venv.png)
_Location_: [Project Path]/[Project Name]/Source,
_Base interpreter_: Path to the installed Python version
*Install required packages:*
If Pycharm does not itself suggest installing the packages listed in the requirements.txt,
manually run the following commands in the PyCharm terminal:
```
cd Source
pip install -r requirements.txt
```
</li>
<li>Run ASYST
![run_button.png](images%2Frun_button.png)
After the file _main.py_ has been selected via the project directory on the left, the greyed out _Start button_ at the top right of the window will appear green. A single click is enough to execute the ASYST code.
</li>
</ol>
\ No newline at end of file
MIT License
Copyright (c) 2022 Larissa Kirschner
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code is originally from https://github.com/nlpyang/pytorch-transformers/blob/master/examples/run_glue.py
# Adapted to the SAG task by Ulrike Pado, HFT Stuttgart: Run a fine-tuned model on given input data to predict short-answer grades.
from __future__ import absolute_import, division, print_function
import argparse
import os
import random
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
BertConfig,
BertForSequenceClassification,
BertTokenizer,
)
from transformers import glue_compute_metrics as compute_metrics
from transformers import (
glue_convert_examples_to_features as convert_examples_to_features,
)
from transformers.data.processors.utils import (
DataProcessor,
InputExample,
)
#logger = logging.getLogger(__name__)
MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
}
def set_seed():
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
def evaluate(args, model, tokenizer, prefix=""):
# Loop to handle MNLI double evaluation (matched, mis-matched)
# and SemEval evaluation (unseen questions, unseen answers, unseen domains)
eval_task_names = ("sag",
)
eval_outputs_dirs = (
(args.output_dir, )
)
results = {}
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
eval_dataset = load_and_cache_examples(
args, eval_task, tokenizer
)
if not os.path.exists(eval_output_dir):
os.makedirs(eval_output_dir)
args.eval_batch_size = 8
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
)
# Eval!
#logger.info("***** Running evaluation {} *****".format(prefix))
#logger.info(" Task name = {}".format(eval_task))
#logger.info(" Num examples = %d", len(eval_dataset))
#logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in eval_dataloader:
#logger.info(" Starting eval for batch")
model.eval()
batch = tuple(t.to(args.device) for t in batch)
#logger.info(" Batch converted to tuple")
with torch.no_grad():
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
"token_type_ids": batch[2],
"labels": batch[3],
}
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
#logger.info("Eval loss: %d", eval_loss)
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(
out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
)
#logger.info("Prediction generation done")
# classification task; choose maximum label
preds = np.argmax(preds, axis=1)
# if evaluating SAG, return both accuracy and F1
task = "sag"
# logger.info("starting to compute metrics")
result = my_compute_metrics(task, preds, out_label_ids)
results.update(result)
# print predictions made by the current model
if args.do_print_predictions:
print_predictions(args, preds)
output_eval_file = os.path.join(
eval_output_dir, prefix + "-eval_results.txt")
#logger.info("sending output to "+str(output_eval_file));
with open(output_eval_file, "w") as writer:
#logger.info("***** Eval results {} *****".format(prefix))
for key in sorted(result.keys()):
#logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return results
def load_and_cache_examples(args, task, tokenizer):
examples = []
# choose the correct processor to read the data
processor = (
SemEvalProcessor()
)
output_mode = "classification"
#logger.info("Creating features from dataset file at %s", args.data_dir)
label_list = processor.get_labels()
examples = (
processor.get_test_examples(args.data_dir)
)
# We are continuing to train mnli models, so task = mnli to create
# the correct type of features
feature_task = "mnli" if task.startswith("sag") else task
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
task=feature_task
)
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(
[f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor(
[f.attention_mask for f in features], dtype=torch.long
)
all_token_type_ids = torch.tensor(
[f.token_type_ids for f in features], dtype=torch.long
)
# do classification setup
all_labels = torch.tensor(
[f.label for f in features], dtype=torch.long)
dataset = TensorDataset(
all_input_ids, all_attention_mask, all_token_type_ids, all_labels
)
return dataset
def main():
# Where are we?
location=".";
if getattr(sys, 'frozen', False):
# running in a bundle
location = sys._MEIPASS
# open a log file next to the executable with line buffering
#out = open("log.txt", "a", buffering=1);
#print("Started English processing in", location, file=out);
parser = argparse.ArgumentParser()
# Required parameters - adapt to current directory
parser.add_argument(
"--data_dir",
# default=None,
default=location+"\\Skript\\outputs\\",
type=str,
# required=True,
required=False,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
"--model_type",
# default=None,
default="bert",
type=str,
# required=True,
required=False,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
)
parser.add_argument(
"--model_name_or_path",
# default=None,
#default= "textattack/bert-base-uncased-MNLI",
default=location+"\\Skript\\english\\seb-bert-mnli",
type=str,
# required=True,
required=False,
help="Path to pre-trained model",
)
parser.add_argument(
"--tokenizer_name",
default="textattack/bert-base-uncased-MNLI",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--output_dir",
# default=None,
default=location+"\\Skript\\english\\seb-bert-mnli",
type=str,
# required=True,
required=False,
help="The output directory where checkpoints will be written.",
)
parser.add_argument(
"--config_name",
default=location+"\\Skript\\english\\seb-bert-mnli\\config.json",
type=str,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
"--max_seq_length",
# default=128,
default=256,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument(
# "--do_test", action="store_true", help="Whether to run eval on the test set."
"--do_test", action="store_false", help="Whether to run eval on the test set."
),
parser.add_argument(
#"--do_print_predictions",action="store_true",help="Whether to print the model predictions for manual inspection.",
"--do_print_predictions",
action="store_false",
help="Whether to print the model predictions for manual inspection.",
),
parser.add_argument(
"--do_lower_case",
# action="store_true",
action="store_false",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--overwrite_output_dir",
# action="store_true",
action="store_false",
help="Overwrite the content of the output directory",
)
args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup CPU processing
device = torch.device("cpu")
args.device = device
# Setup logging
#logging.basicConfig(
# format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
# datefmt="%m/%d/%Y %H:%M:%S",
# filename='log.txt',
# filemode='a',
# level=logging.INFO,
#)
#logger.warning(
# "Device: %s",
# device
#)
# Set seed to 42
set_seed()
processor = (
SemEvalProcessor()
)
args.output_mode = (
"classification"
)
label_list = processor.get_labels()
num_labels = len(label_list)
args.model_type = args.model_type.lower()
#logger.info("Model %s", args.model_type)
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels,
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else
args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model = model_class.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model.to(args.device)
#logger.info("Training/evaluation parameters %s", args)
# Evaluation
results = {}
if args.do_test:
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else
args.model_name_or_path,
do_lower_case=args.do_lower_case,
)
checkpoints = [args.output_dir]
#logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split(
"-")[-1] if len(checkpoints) > 1 else ""
prefix = str(global_step)
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=prefix)
result = dict((k + "_{}".format(global_step), v)
for k, v in result.items())
results.update(result)
else: # use currently active model
result = evaluate(args, model, tokenizer, prefix="test")
#results.update(result)
return results
# define a new data processor for the SemEval data/SAG task
class SemEvalProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
)
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
)
def get_labels(self):
"""See base class."""
return ["correct", "incorrect", "NONE"]
def _create_examples(self, lines, set_type):
"""Creates examples for the test set."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = line[0]
text_a = line[1]
text_b = line[2]
label = line[-1]
examples.append(
InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label)
)
return examples
# custom metrics for SAG: F1 and Accuracy
def my_compute_metrics(eval_task, preds, labels):
result = {}
if eval_task == "sag":
acc = accuracy_score(y_pred=preds, y_true=labels)
f1_weighted = f1_score(y_pred=preds, y_true=labels, average="weighted")
f1_macro = f1_score(y_pred=preds, y_true=labels, average="macro")
result = {"f1-weighted": f1_weighted,
"f1-macro": f1_macro, "accuracy": acc}
else:
result = compute_metrics(eval_task, preds, labels)
return result
def print_predictions(args, preds):
# generate data set part of output path
dir_name = (""
)
# get examples
processor = (
SemEvalProcessor()
)
examples = (
processor.get_test_examples(args.data_dir)
)
with open(args.data_dir + "/" + dir_name + "/predictions.txt", "w", encoding="utf8") as writer:
# print("# examples: " + str(len(examples)))
# print("# labels: " + str(len(labels)))
# print("# preds: " + str(len(preds)))
writer.write(
"question\treferenceAnswer\tstudentAnswer\tsuggested grade\tobserved grade\n")
for i in range(len(examples)):
# iterate over data
# print prediction as a text-based label
hrpred = "incorrect"
if preds[i] == 0:
hrpred = "correct"
# get guid, text, from inputExample
writer.write(
str(examples[i].guid)
+ "\t"
+ examples[i].text_a
+ "\t"
+ examples[i].text_b
+ "\t"
+ hrpred
+ "\t"
+ examples[i].label
+ "\n"
)
# else: print("Labels don't match! "+str(i)+": "+str(examples[i].label)+" "+str(labels[i]))
if __name__ == "__main__":
main()
{
"_name_or_path": "textattack/bert-base-uncased-MNLI",
"architectures": [
"BertForSequenceClassification"
],
"attention_probs_dropout_prob": 0.1,
"finetuning_task": "sag-seb",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1",
"2": "LABEL_2"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1,
"LABEL_2": 2
},
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.2.2",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
File added
MIT License
Copyright (c) 2022 Yunus Eryilmaz
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
MIT License
Copyright (c) 2022 Yunus Eryilmaz
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
File added
import os
import sys
import time
import numpy as np
import pandas as pd
# UP
import pickle
import argparse
from sklearn import metrics
from sentence_transformers import models, SentenceTransformer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, cross_val_predict
__author__ = "Yunus Eryilmaz"
__version__ = "1.0"
__date__ = "21.07.2021"
__source__ = "https://pypi.org/project/sentence-transformers/0.3.0/"
def main():
parser = argparse.ArgumentParser()
# Where are we?
location = ".";
if getattr(sys, 'frozen', False):
# running in a bundle
location = sys._MEIPASS
# Required parameters
parser.add_argument(
"--data",
# default=None,
default=location + "\\Skript\\outputs\\test.tsv",
type=str,
# required=True,
required=False,
help="The input data file for the task.",
)
parser.add_argument(
"--output_dir",
# default=None,
default=location + "\\Skript\\outputs\\",
type=str,
# required=True,
required=False,
help="The output directory where predictions will be written.",
)
parser.add_argument(
"--model_dir",
# default=None,
default=location + "\\Skript\\german\\models",
type=str,
# required=True,
required=False,
help="The directory where the ML models are stored.",
)
args = parser.parse_args()
# open a log file next to the executable with line buffering
# out = open("log.txt", "a",buffering=1);
# print("Started German processing in",location,file=out);
# import SentenceTransformer-model
start_time = time.time()
# print("Reading from",args.data, file=out);
with open(os.path.join(location, args.data)) as ft:
dft = pd.read_csv(ft, delimiter='\t')
# Sentences we want sentence embeddings for
sentences1_test = dft['referenceAnswer'].values.tolist()
sentences2_test = dft['studentAnswer'].values.tolist()
# print("Input read:",sentences2_test, file=out);
# print(sentences1_test)
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# pooling operation can choose by setting true (Apply mean pooling to get one fixed sized sentence vector)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False)
# compute the sentence embeddings for both sentences
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# print("Model loaded", file=out);
sentence_embeddings1_test = model.encode(sentences1_test, convert_to_tensor=True, show_progress_bar=False)
# print("Embeddings RefA:",sentence_embeddings1_test,file=out);
sentence_embeddings2_test = model.encode(sentences2_test, convert_to_tensor=True, show_progress_bar=False)
# print("Embeddings found", file=out);
# Possible concatenations from the embedded sentences can be selected
def similarity(sentence_embeddings1, sentence_embeddings2):
# I2=(|u − v| + u ∗ v)
simi = abs(np.subtract(sentence_embeddings1, sentence_embeddings2)) + np.multiply(sentence_embeddings1,
sentence_embeddings2)
return simi
# calls the similarity function and get the concatenated values between the sentence embeddings
computed_simis_test = similarity(sentence_embeddings1_test, sentence_embeddings2_test)
# get the sentence embeddings and the labels fpr train and test
X_test = computed_simis_test
# Y_test = np.array(dft['label'])
# UP: read pre-trained LR model
clf_log = pickle.load(open(args.model_dir + "\\clf_BERT.pickle", "rb"))
# print('--------Evaluate on Testset------- ', file=out)
predictions = clf_log.predict(X_test)
# new code inserted here
count = 0
# observed grade list created
observed_grade_column = dft['observed grade']
obs_grade = observed_grade_column.tolist()
# suggested grade list created
temp_sugg_grade = predictions
sugg_grade = ['correct' if pred == 1 else 'incorrect' for pred in temp_sugg_grade]
# Check if obs_grade contains "NONE" values or is empty
if not obs_grade or all(grade == 'NONE' for grade in obs_grade):
# print("obs_grade is empty or contains 'NONE' values. Skipping classification report.")
count += 1
else:
# classification report
classification_rep = classification_report(obs_grade, sugg_grade)
report_string = classification_rep
report_lines = report_string.split('\n')
# print(report_lines)
# accuracy line
formatted_accuracy_line = "\t".join(report_lines[5].split())
formatted_acc_line_with_tabs = (formatted_accuracy_line[:formatted_accuracy_line.index('\t',
formatted_accuracy_line.index(
'\t'))] + '\t\t' +
formatted_accuracy_line[
formatted_accuracy_line.index('\t', formatted_accuracy_line.index('\t')):])
# #weighted avg printing
#
wt_avg_line = "\t".join(report_lines[7].split())
# print(wt_avg_line)
new_wt_avg_line = wt_avg_line.replace("\t", " ", 1)
# print(new_wt_avg_line)
#
# wt_avg_line = report_lines[7].split()
#
# #wt_avg_line
#
# wg_element_1 = wt_avg_line[0]
# wg_element_2 = wt_avg_line[1]
#
# print(wg_element_1)
# print(wg_element_2)
#
# new_wt_line_out_1_2 =
# formatted_wt_with_tabs = (wt_avg_line[:wt_avg_line.index('\t',
# wt_avg_line.index('\t')) +1 ] + '\t' +
# wt_avg_line[wt_avg_line.index('\t', wt_avg_line.index('\t')):])
# Join the entire newly formatted list into a single string
formatted_output = "\n".join([
"\t precision \t recall \t f1-score \t support",
"\t".join(report_lines[2].split()),
"\t".join(report_lines[3].split()),
formatted_acc_line_with_tabs,
new_wt_avg_line
])
# Print the entire formatted output
# print("\nFormatted Output:")
# print(formatted_output)
# UP print results
with open(args.output_dir + "\\predictions.txt", "w") as writer:
writer.write("question\treferenceAnswer\tstudentAnswer\tsuggested_grade\tobserved_grade\n")
for i in range(len(dft)):
hrpred = "incorrect"
if predictions[i] == 1:
hrpred = "correct"
writer.write(
str(dft.iloc[i][0])
+ "\t"
+ str(dft.iloc[i][1])
+ "\t"
+ str(dft.iloc[i][2])
+ "\t"
+ str(hrpred)
+ "\t"
+ str(dft.iloc[i][3])
+ "\n"
)
if count == 1:
writer.write("\nClassification Report cannot pe printed as observed grade column is empty or filled "
"with 'NONE' or 'none' values\n")
else:
# Write the classification report to the file
writer.write("\nClassification Report:\n")
writer.write(formatted_output)
# print('\nExecution time:', time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)),file=out)
if __name__ == "__main__":
main()