Commit dab032d8 authored by Kaif Siddique

-Cleanup: 1. Removed the 'english' folder and renamed the main folder to 'lang'. 2. Created a new executable
parent 4c36806b
Showing with 4 additions and 619 deletions
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code is originally from https://github.com/nlpyang/pytorch-transformers/blob/master/examples/run_glue.py
# Adapted to the SAG task by Ulrike Pado, HFT Stuttgart: Run a fine-tuned model on given input data to predict short-answer grades.
from __future__ import absolute_import, division, print_function
import argparse
import os
import random
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from transformers import (
BertConfig,
BertForSequenceClassification,
BertTokenizer,
)
from transformers import glue_compute_metrics as compute_metrics
from transformers import (
glue_convert_examples_to_features as convert_examples_to_features,
)
from transformers.data.processors.utils import (
DataProcessor,
InputExample,
)
#logger = logging.getLogger(__name__)
MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
}
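# Only BERT-style models are wired up here; evaluating other architectures would
# require extending MODEL_CLASSES with the matching config/model/tokenizer classes.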
def set_seed():
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
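# evaluate() runs the fine-tuned classifier over the test data on CPU, collects
# logits batch by batch, derives class predictions via argmax and reports
# accuracy/F1 through my_compute_metrics(); the scores are also written to
# <output_dir>/<prefix>-eval_results.txt.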
def evaluate(args, model, tokenizer, prefix=""):
# Loop to handle MNLI double evaluation (matched, mis-matched)
# and SemEval evaluation (unseen questions, unseen answers, unseen domains)
eval_task_names = ("sag",
)
eval_outputs_dirs = (
(args.output_dir, )
)
results = {}
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
eval_dataset = load_and_cache_examples(
args, eval_task, tokenizer
)
if not os.path.exists(eval_output_dir):
os.makedirs(eval_output_dir)
args.eval_batch_size = 8
# SequentialSampler iterates over the evaluation set in its original order
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
)
# Eval!
#logger.info("***** Running evaluation {} *****".format(prefix))
#logger.info(" Task name = {}".format(eval_task))
#logger.info(" Num examples = %d", len(eval_dataset))
#logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in eval_dataloader:
#logger.info(" Starting eval for batch")
model.eval()
batch = tuple(t.to(args.device) for t in batch)
#logger.info(" Batch converted to tuple")
with torch.no_grad():
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
"token_type_ids": batch[2],
"labels": batch[3],
}
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
#logger.info("Eval loss: %d", eval_loss)
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(
out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
)
#logger.info("Prediction generation done")
# classification task; choose maximum label
preds = np.argmax(preds, axis=1)
# if evaluating SAG, return both accuracy and F1
task = "sag"
# logger.info("starting to compute metrics")
result = my_compute_metrics(task, preds, out_label_ids)
results.update(result)
# print predictions made by the current model
if args.do_print_predictions:
print_predictions(args, preds)
output_eval_file = os.path.join(
eval_output_dir, prefix + "-eval_results.txt")
#logger.info("sending output to "+str(output_eval_file));
with open(output_eval_file, "w") as writer:
#logger.info("***** Eval results {} *****".format(prefix))
for key in sorted(result.keys()):
#logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return results
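# load_and_cache_examples() reads test.tsv through SemEvalProcessor, converts the
# examples into MNLI-style BERT features and packs them into a TensorDataset for
# the evaluation loop above.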
def load_and_cache_examples(args, task, tokenizer):
examples = []
# choose the correct processor to read the data
processor = SemEvalProcessor()
output_mode = "classification"
#logger.info("Creating features from dataset file at %s", args.data_dir)
label_list = processor.get_labels()
examples = processor.get_test_examples(args.data_dir)
# The model was fine-tuned from an MNLI checkpoint, so use task = "mnli" here
# to create features of the type the model expects.
feature_task = "mnli" if task.startswith("sag") else task
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
task=feature_task
)
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(
[f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor(
[f.attention_mask for f in features], dtype=torch.long
)
all_token_type_ids = torch.tensor(
[f.token_type_ids for f in features], dtype=torch.long
)
# do classification setup
all_labels = torch.tensor(
[f.label for f in features], dtype=torch.long)
dataset = TensorDataset(
all_input_ids, all_attention_mask, all_token_type_ids, all_labels
)
return dataset
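# main() ties the pipeline together: parse the arguments (whose defaults point at
# the bundled Skript directory), load the fine-tuned seb-bert-mnli checkpoint and
# tokenizer, and evaluate it on CPU.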
def main():
# Where are we?
location=".";
if getattr(sys, 'frozen', False):
# running in a bundle
location = sys._MEIPASS
# open a log file next to the executable with line buffering
#out = open("log.txt", "a", buffering=1);
#print("Started English processing in", location, file=out);
parser = argparse.ArgumentParser()
# Required parameters - adapt to current directory
parser.add_argument(
"--data_dir",
# default=None,
default=location+"\\Skript\\outputs\\",
type=str,
# required=True,
required=False,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
"--model_type",
# default=None,
default="bert",
type=str,
# required=True,
required=False,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
)
parser.add_argument(
"--model_name_or_path",
# default=None,
#default= "textattack/bert-base-uncased-MNLI",
default=location+"\\Skript\\english\\seb-bert-mnli",
type=str,
# required=True,
required=False,
help="Path to pre-trained model",
)
parser.add_argument(
"--tokenizer_name",
default="textattack/bert-base-uncased-MNLI",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--output_dir",
# default=None,
default=location+"\\Skript\\english\\seb-bert-mnli",
type=str,
# required=True,
required=False,
help="The output directory where checkpoints will be written.",
)
parser.add_argument(
"--config_name",
default=location+"\\Skript\\english\\seb-bert-mnli\\config.json",
type=str,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
"--max_seq_length",
# default=128,
default=256,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument(
# "--do_test", action="store_true", help="Whether to run eval on the test set."
"--do_test", action="store_false", help="Whether to run eval on the test set."
)
parser.add_argument(
#"--do_print_predictions",action="store_true",help="Whether to print the model predictions for manual inspection.",
"--do_print_predictions",
action="store_false",
help="Whether to print the model predictions for manual inspection.",
)
parser.add_argument(
"--do_lower_case",
# action="store_true",
action="store_false",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--overwrite_output_dir",
# action="store_true",
action="store_false",
help="Overwrite the content of the output directory",
)
args = parser.parse_args()
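# Example invocation (hypothetical paths; every argument is optional because the
# defaults above assume the PyInstaller bundle layout):
#   python run_SAG_mnli.py --data_dir .\Skript\outputs --max_seq_length 256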
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup CPU processing
device = torch.device("cpu")
args.device = device
# Setup logging
#logging.basicConfig(
# format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
# datefmt="%m/%d/%Y %H:%M:%S",
# filename='log.txt',
# filemode='a',
# level=logging.INFO,
#)
#logger.warning(
# "Device: %s",
# device
#)
# Set seed to 42
set_seed()
processor = SemEvalProcessor()
args.output_mode = "classification"
label_list = processor.get_labels()
num_labels = len(label_list)
args.model_type = args.model_type.lower()
#logger.info("Model %s", args.model_type)
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels,
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else
args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model = model_class.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model.to(args.device)
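# No training happens in this script; the checkpoint is only loaded and moved to
# the CPU device configured above for evaluation.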
#logger.info("Training/evaluation parameters %s", args)
# Evaluation
results = {}
if args.do_test:
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else
args.model_name_or_path,
do_lower_case=args.do_lower_case,
)
checkpoints = [args.output_dir]
#logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split(
"-")[-1] if len(checkpoints) > 1 else ""
prefix = str(global_step)
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=prefix)
result = dict((k + "_{}".format(global_step), v)
for k, v in result.items())
results.update(result)
else: # use currently active model
result = evaluate(args, model, tokenizer, prefix="test")
#results.update(result)
return results
# define a new data processor for the SemEval data/SAG task
class SemEvalProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
)
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
)
def get_labels(self):
"""See base class."""
return ["correct", "incorrect", "NONE"]
def _create_examples(self, lines, set_type):
"""Creates examples for the test set."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = line[0]
text_a = line[1]
text_b = line[2]
label = line[-1]
examples.append(
InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label)
)
return examples
# custom metrics for SAG: F1 and Accuracy
def my_compute_metrics(eval_task, preds, labels):
result = {}
if eval_task == "sag":
acc = accuracy_score(y_pred=preds, y_true=labels)
f1_weighted = f1_score(y_pred=preds, y_true=labels, average="weighted")
f1_macro = f1_score(y_pred=preds, y_true=labels, average="macro")
result = {"f1-weighted": f1_weighted,
"f1-macro": f1_macro, "accuracy": acc}
else:
result = compute_metrics(eval_task, preds, labels)
return result
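# print_predictions() writes predictions.txt into the data directory: one line per
# example (question, reference answer, student answer, suggested and observed
# grade), followed by a classification report whenever observed grades are
# available. Prediction id 0 maps to "correct" and any other id to "incorrect",
# matching the label order returned by SemEvalProcessor.get_labels().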
def print_predictions(args, preds):
# generate data set part of output path
dir_name = (""
)
# get examples
processor = (
SemEvalProcessor()
)
examples = (
processor.get_test_examples(args.data_dir)
)
# collect the observed (gold) grades from the input examples
obs_grade = [ex.label for ex in examples]
# map predicted label ids to human-readable grades
sugg_grade = ['correct' if pred == 0 else 'incorrect' for pred in preds]
# flag: set to 1 below when no usable observed grades exist
count = 0
# Check if obs_grade contains "NONE" values or is empty
if not obs_grade or all(grade == 'NONE' for grade in obs_grade):
count += 1
else:
# classification report
classification_rep = classification_report(obs_grade, sugg_grade)
report_string = classification_rep
report_lines = report_string.split('\n')
# print(report_lines)
# accuracy line: classification_report's accuracy row has no precision/recall
# columns, so pad it with extra tabs to keep its value under the f1-score column
formatted_accuracy_line = "\t".join(report_lines[5].split())
tab_pos = formatted_accuracy_line.index('\t', formatted_accuracy_line.index('\t'))
formatted_acc_line_with_tabs = (
formatted_accuracy_line[:tab_pos] + '\t\t' + formatted_accuracy_line[tab_pos:]
)
# weighted avg line: merge the first tab so "weighted avg" remains a single field
wt_avg_line = "\t".join(report_lines[7].split())
new_wt_avg_line = wt_avg_line.replace("\t", " ", 1)
# Join the entire newly formatted list into a single string
formatted_output = "\n".join([
"\t precision \t recall \t f1-score \t support",
"\t".join(report_lines[2].split()),
"\t".join(report_lines[3].split()),
formatted_acc_line_with_tabs,
new_wt_avg_line
])
with open(args.data_dir + "/" + dir_name + "/predictions.txt", "w", encoding="utf8") as writer:
writer.write(
"question\treferenceAnswer\tstudentAnswer\tsuggested grade\tobserved grade\n")
for i in range(len(examples)):
# iterate over data
# print prediction as a text-based label
hrpred = "incorrect"
if preds[i] == 0:
hrpred = "correct"
# get guid, text, from inputExample
writer.write(
str(examples[i].guid)
+ "\t"
+ examples[i].text_a
+ "\t"
+ examples[i].text_b
+ "\t"
+ hrpred
+ "\t"
+ examples[i].label
+ "\n"
)
if count == 1:
writer.write("\nClassification Report cannot be printed as observed grade column is empty or filled "
"with 'NONE' or 'none' values\n")
else:
# Write the classification report to the file
writer.write(
"\nClassification Report - high Precision for classes correct or incorrect indicates that the class prediction is reliable:\n")
writer.write(formatted_output)
if __name__ == "__main__":
main()
{
"_name_or_path": "textattack/bert-base-uncased-MNLI",
"architectures": [
"BertForSequenceClassification"
],
"attention_probs_dropout_prob": 0.1,
"finetuning_task": "sag-seb",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1",
"2": "LABEL_2"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1,
"LABEL_2": 2
},
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.2.2",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
File deleted
File deleted
File moved
@@ -52,7 +52,7 @@ def main():
parser.add_argument(
"--model_dir",
# default=None,
default=location + "\\Skript\\german\\models",
default=location + "\\Skript\\lang\\models",
type=str,
# required=True,
required=False,
......
@@ -14,7 +14,7 @@ import tkinter
import pandas as pd
import sys
import os
from Skript.german.run_LR_SBERT import main as backend
from Skript.lang.run_LR_SBERT import main as backend
class App(ttk.Frame):
......
@@ -37,8 +37,8 @@ datas +=[('azure.tcl', '.'),
('.\\Skript','./Skript')]
a = Analysis(
# ['.\\main.py','.\\Skript\\german\\run_LR_SBERT.py','.\\Skript\\english\\run_SAG_mnli.py', ],
['.\\main.py','.\\Skript\\german\\run_LR_SBERT.py', ],
# ['.\\main.py','.\\Skript\\lang\\run_LR_SBERT.py','.\\Skript\\english\\run_SAG_mnli.py', ],
['.\\main.py','.\\Skript\\lang\\run_LR_SBERT.py', ],
# Hier den Pfad zu den site-packages (entweder des environments oder der zentralen Installation) angeben
pathex=[
'..\\venv\\Lib\\site-packages\\',
......