# run_SAG_mnli.py

# coding=utf-8

# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Code is originally from https://github.com/nlpyang/pytorch-transformers/blob/master/examples/run_glue.py
# Adapted to the SAG task by Ulrike Pado, HFT Stuttgart: Run a fine-tuned model on given input data to predict short-answer grades.


from __future__ import absolute_import, division, print_function

import argparse
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
)

from transformers import glue_compute_metrics as compute_metrics
from transformers import (
    glue_convert_examples_to_features as convert_examples_to_features,
)
from transformers.data.processors.utils import (
    DataProcessor,
    InputExample,
)

#logger = logging.getLogger(__name__)

# Registry mapping a --model_type value to the (config, model, tokenizer)
# classes used to load it. Only "bert" is registered.
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
}


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to all three generators. Defaults to 42,
            the value that was previously hard-coded, so ``set_seed()``
            behaves exactly as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def evaluate(args, model, tokenizer, prefix=""):
    """Run ``model`` over the SAG test data and write/return its metrics.

    The test set is loaded via ``load_and_cache_examples``, predictions are
    taken as the arg-max over the model's logits, and accuracy/F1 are computed
    with ``my_compute_metrics``. The metrics are written to
    ``<args.output_dir>/<prefix>-eval_results.txt``; when
    ``args.do_print_predictions`` is set, ``print_predictions`` is called too.

    Args:
        args: Namespace providing at least ``output_dir``, ``device`` and
            ``do_print_predictions`` (plus whatever the data loader reads).
            ``eval_batch_size`` is set on it (8 unless already present).
        model: Sequence-classification model; it is called with ``labels`` so
            that its first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: String prepended to the results file name.

    Returns:
        dict: Metric name -> value for the evaluated task.

    Raises:
        ValueError: If the test set yields no examples to evaluate.
    """
    # The GLUE-style loop over (task, output dir) pairs is kept even though
    # only the single "sag" task is evaluated here.
    eval_task_names = ("sag",)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer)

        os.makedirs(eval_output_dir, exist_ok=True)

        # Respect a batch size supplied by the caller; default to 8 otherwise.
        args.eval_batch_size = getattr(args, "eval_batch_size", None) or 8

        # Sequential sampling keeps predictions aligned with the input order.
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(
            eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
        )

        logit_batches = []
        label_batches = []

        # Switching to eval mode once is enough; nothing below changes it.
        model.eval()
        for batch in eval_dataloader:
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3],
                }
                outputs = model(**inputs)
                # With labels supplied the loss comes first; only the logits
                # are needed for prediction.
                _, logits = outputs[:2]

            logit_batches.append(logits.detach().cpu().numpy())
            label_batches.append(inputs["labels"].detach().cpu().numpy())

        if not logit_batches:
            raise ValueError(
                "No evaluation examples were found for task '{}'.".format(
                    eval_task)
            )

        # Concatenate once instead of re-copying the arrays for every batch.
        # classification task; choose maximum label
        preds = np.argmax(np.concatenate(logit_batches, axis=0), axis=1)
        out_label_ids = np.concatenate(label_batches, axis=0)

        # if evaluating SAG, return both accuracy and F1
        result = my_compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        # print predictions made by the current model
        if args.do_print_predictions:
            print_predictions(args, preds)

        output_eval_file = os.path.join(
            eval_output_dir, prefix + "-eval_results.txt")

        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results


def load_and_cache_examples(args, task, tokenizer):
    """Build a ``TensorDataset`` from the test split in ``args.data_dir``.

    Despite the name, nothing is cached: the examples are read through
    ``SemEvalProcessor`` and converted to features on every call.

    Args:
        args: Namespace providing ``data_dir`` and ``max_seq_length``.
        task: Task name; names starting with "sag" are featurized as "mnli".
        tokenizer: Tokenizer handed to the feature conversion.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels),
        all as ``torch.long`` tensors.
    """
    sag_processor = SemEvalProcessor()
    known_labels = sag_processor.get_labels()
    test_examples = sag_processor.get_test_examples(args.data_dir)

    # The models being evaluated were trained as MNLI models, so SAG data is
    # converted with the MNLI feature layout.
    if task.startswith("sag"):
        feature_task = "mnli"
    else:
        feature_task = task

    feats = convert_examples_to_features(
        test_examples,
        tokenizer,
        label_list=known_labels,
        max_length=args.max_seq_length,
        output_mode="classification",
        task=feature_task,
    )

    def _as_long_tensor(attr_name):
        # Gather one attribute across all features into a long tensor.
        return torch.tensor(
            [getattr(feat, attr_name) for feat in feats], dtype=torch.long
        )

    input_ids = _as_long_tensor("input_ids")
    attention_mask = _as_long_tensor("attention_mask")
    token_type_ids = _as_long_tensor("token_type_ids")
    labels = _as_long_tensor("label")

    return TensorDataset(input_ids, attention_mask, token_type_ids, labels)


def _build_arg_parser(location):
    """Create the command-line parser with defaults rooted at ``location``."""
    # os.path.join yields the same backslash paths as before on Windows and
    # valid paths elsewhere. The trailing "" keeps data_dir's final separator.
    data_dir = os.path.join(location, "Skript", "outputs", "")
    model_dir = os.path.join(location, "Skript", "english", "seb-bert-mnli")

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_dir",
        default=data_dir,
        type=str,
        required=False,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        required=False,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=model_dir,
        type=str,
        required=False,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="textattack/bert-base-uncased-MNLI",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--output_dir",
        default=model_dir,
        type=str,
        required=False,
        help="The output directory where checkpoints will be written.",
    )
    parser.add_argument(
        "--config_name",
        default=os.path.join(model_dir, "config.json"),
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )

    # NOTE: the boolean options below use store_false, i.e. they are ON by
    # default and passing the flag switches them OFF. This lets the script run
    # without any command-line arguments.
    parser.add_argument(
        "--do_test",
        action="store_false",
        help="By default the model stored in --output_dir is evaluated on the test set. "
        "Pass this flag to evaluate the model from --model_name_or_path instead.",
    )
    parser.add_argument(
        "--do_print_predictions",
        action="store_false",
        help="By default the model predictions are written out for manual inspection. "
        "Pass this flag to disable that.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_false",
        help="By default input is lower-cased (as needed for an uncased model). "
        "Pass this flag to disable lower-casing.",
    )
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_false",
        help="By default a non-empty output directory may be written to. "
        "Pass this flag to abort instead when it is not empty.",
    )
    return parser


def main():
    """Parse the arguments, load tokenizer and model, and evaluate on CPU.

    With ``args.do_test`` true (the default) the model saved in
    ``args.output_dir`` is evaluated; otherwise the model at
    ``args.model_name_or_path`` is loaded with the configured number of
    labels and evaluated with the result-file prefix "test".

    Returns:
        dict: Metrics returned by ``evaluate``. In the ``do_test`` branch each
        key carries a ``_<global_step>`` suffix, which is just ``_`` for the
        single checkpoint used here.

    Raises:
        ValueError: If the output directory is non-empty and overwriting was
            disabled on the command line.
    """
    # Resolve default paths relative to the bundle directory when running as a
    # frozen executable (sys._MEIPASS), else to the current working directory.
    location = "."
    if getattr(sys, "frozen", False):
        location = sys._MEIPASS

    args = _build_arg_parser(location).parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CPU processing
    args.device = torch.device("cpu")

    set_seed()

    args.output_mode = "classification"
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    cache_dir = args.cache_dir if args.cache_dir else None

    # The tokenizer is identical in both branches, so it is loaded once.
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir,
    )

    results = {}
    if args.do_test:
        # Only the checkpoint model is evaluated in this branch, so it is the
        # only model that gets loaded.
        checkpoints = [args.output_dir]
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)

            result = evaluate(args, model, tokenizer, prefix=str(global_step))
            results.update(
                {"{}_{}".format(k, global_step): v for k, v in result.items()}
            )
    else:  # use the model given by --model_name_or_path
        num_labels = len(SemEvalProcessor().get_labels())
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            cache_dir=cache_dir,
        )
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=cache_dir,
        )
        model.to(args.device)

        # Hand the metrics back to the caller instead of dropping them.
        results.update(evaluate(args, model, tokenizer, prefix="test"))
    return results


# define a new data processor for the SemEval data/SAG task


class SemEvalProcessor(DataProcessor):
    """Reads SemEval/SAG short-answer data from tab-separated files.

    Each data row is read as ``guid, text_a, text_b, ..., label``: the first
    three columns are the example id and the two texts, and the last column is
    the grade. The first row of every file is treated as a header.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
        )

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
        )

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
        )

    def get_labels(self):
        """See base class."""
        return ["correct", "incorrect", "NONE"]

    def _create_examples(self, lines, set_type):
        """Create ``InputExample`` objects from the rows of one split.

        Args:
            lines: Sequence of rows, each a list of column strings; the first
                row is skipped as the header.
            set_type: Name of the split ("train", "dev" or "test"). Unused,
                kept for parity with the other GLUE-style processors.

        Returns:
            list of InputExample, one per non-empty data row.

        Raises:
            ValueError: If a data row has fewer than three columns.
        """
        examples = []
        for (i, line) in enumerate(lines):
            # Skip the header and completely empty rows.
            if i == 0 or not line:
                continue
            if len(line) < 3:
                raise ValueError(
                    "Row {} has {} column(s); expected at least guid, "
                    "text_a and text_b.".format(i, len(line))
                )
            # Without a separate grade column, line[-1] would be the second
            # text, so a missing or blank grade becomes the "NONE" label.
            label = line[-1] if len(line) > 3 else ""
            if not label:
                label = "NONE"
            examples.append(
                InputExample(guid=line[0], text_a=line[1],
                             text_b=line[2], label=label)
            )

        return examples


# custom metrics for SAG: F1 and Accuracy


def my_compute_metrics(eval_task, preds, labels):
    """Compute the evaluation metrics for ``eval_task``.

    For the "sag" task this returns weighted F1, macro F1 and accuracy; every
    other task is delegated to the GLUE ``compute_metrics`` function.

    Args:
        eval_task: Task name.
        preds: Predicted label ids.
        labels: Gold label ids.

    Returns:
        dict: Metric name -> value.
    """
    # Anything that is not SAG uses the stock GLUE metrics.
    if eval_task != "sag":
        return compute_metrics(eval_task, preds, labels)

    scores = {}
    for averaging in ("weighted", "macro"):
        scores["f1-" + averaging] = f1_score(
            y_true=labels, y_pred=preds, average=averaging
        )
    scores["accuracy"] = accuracy_score(y_true=labels, y_pred=preds)
    return scores


def _format_classification_report(observed, suggested):
    """Return a tab-separated summary of sklearn's classification report.

    Rows are selected by their name rather than by line position, so the
    result stays correct when only one of the two classes occurs. The output
    contains the header, the "correct"/"incorrect" rows that are present, the
    accuracy row and the weighted-average row.
    """
    report = classification_report(observed, suggested)

    rows = {}
    for line in report.splitlines():
        fields = line.split()
        if not fields:
            continue
        if fields[0] in ("correct", "incorrect", "accuracy"):
            rows[fields[0]] = fields
        elif fields[:2] == ["weighted", "avg"]:
            rows["weighted avg"] = fields[2:]

    out_lines = ["\t precision \t recall \t f1-score \t support"]
    for class_name in ("correct", "incorrect"):
        if class_name in rows:
            out_lines.append("\t".join(rows[class_name]))
    if "accuracy" in rows:
        # Extra tabs shift the accuracy value under the f1-score column.
        out_lines.append("accuracy\t\t\t" + "\t".join(rows["accuracy"][1:]))
    if "weighted avg" in rows:
        out_lines.append("weighted avg\t" + "\t".join(rows["weighted avg"]))
    return "\n".join(out_lines)


def print_predictions(args, preds):
    """Write the predictions for the test set to ``predictions.txt``.

    The file is created in ``args.data_dir`` and holds one tab-separated row
    per test example (id, both texts, suggested grade, observed grade). It
    ends with a classification report when observed grades exist, or with a
    note explaining why no report can be given.

    Args:
        args: Namespace providing ``data_dir``.
        preds: Predicted label ids in example order; id 0 is reported as
            "correct" and every other id as "incorrect".
    """
    examples = SemEvalProcessor().get_test_examples(args.data_dir)

    # suggested grade per example, as a text label
    suggested = ["correct" if pred == 0 else "incorrect" for pred in preds]

    # Only examples with a real observed grade go into the report; ungraded
    # ("NONE") rows would add a third class to it.
    graded = [
        (ex.label, grade)
        for ex, grade in zip(examples, suggested)
        if ex.label in ("correct", "incorrect")
    ]

    report = None
    if graded:
        observed_grades, suggested_grades = zip(*graded)
        report = _format_classification_report(
            list(observed_grades), list(suggested_grades)
        )

    output_path = os.path.join(args.data_dir, "predictions.txt")
    with open(output_path, "w", encoding="utf8") as writer:
        writer.write(
            "question\treferenceAnswer\tstudentAnswer\tsuggested grade\tobserved grade\n")
        for ex, grade in zip(examples, suggested):
            writer.write(
                "\t".join(
                    (str(ex.guid), ex.text_a, ex.text_b, grade, ex.label)
                )
                + "\n"
            )

        if report is None:
            writer.write("\nClassification Report cannot be printed as observed grade column is empty or filled "
                         "with 'NONE' or 'none' values\n")
        else:
            # Write the classification report to the file
            writer.write(
                "\nClassification Report - high Precision for classes correct or incorrect indicates that the class prediction is reliable:\n")
            writer.write(report)


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()