# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Code is originally from
# https://github.com/nlpyang/pytorch-transformers/blob/master/examples/run_glue.py
# Adapted to the SAG task by Ulrike Pado, HFT Stuttgart: run a fine-tuned model
# on given input data to predict short-answer grades.

from __future__ import absolute_import, division, print_function

import argparse
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
)
from transformers import glue_compute_metrics as compute_metrics
from transformers import (
    glue_convert_examples_to_features as convert_examples_to_features,
)
from transformers.data.processors.utils import (
    DataProcessor,
    InputExample,
)

# logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
}


def set_seed():
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    # and SemEval evaluation (unseen questions, unseen answers, unseen domains)
    eval_task_names = ("sag",)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = 8
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(
            eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
        )

        # Eval!
        # logger.info("***** Running evaluation {} *****".format(prefix))
        # logger.info("  Task name = {}".format(eval_task))
        # logger.info("  Num examples = %d", len(eval_dataset))
        # logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in eval_dataloader:
            # logger.info("  Starting eval for batch")
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            # logger.info("  Batch converted to tuple")

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3],
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
                # logger.info("Eval loss: %d", eval_loss)
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
                )

        # logger.info("Prediction generation done")

        # classification task; choose maximum label
        preds = np.argmax(preds, axis=1)

        # if evaluating SAG, return both accuracy and F1
        task = "sag"
        # logger.info("starting to compute metrics")
        result = my_compute_metrics(task, preds, out_label_ids)
        results.update(result)

        # print predictions made by the current model
        if args.do_print_predictions:
            print_predictions(args, preds)

        output_eval_file = os.path.join(eval_output_dir, prefix + "-eval_results.txt")
        # logger.info("sending output to " + str(output_eval_file))
        with open(output_eval_file, "w") as writer:
            # logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                # logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results


def load_and_cache_examples(args, task, tokenizer):
    examples = []
    # choose the correct processor to read the data
    processor = SemEvalProcessor()
    output_mode = "classification"

    # logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    examples = processor.get_test_examples(args.data_dir)

    # We are continuing to train mnli models, so task = mnli to create
    # the correct type of features
    feature_task = "mnli" if task.startswith("sag") else task
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        task=feature_task,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor(
        [f.attention_mask for f in features], dtype=torch.long
    )
    all_token_type_ids = torch.tensor(
        [f.token_type_ids for f in features], dtype=torch.long
    )

    # do classification setup
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids, all_labels
    )
    return dataset


def main():
    # Where are we?
    location = "."
    if getattr(sys, "frozen", False):
        # running in a bundle
        location = sys._MEIPASS

    # open a log file next to the executable with line buffering
    # out = open("log.txt", "a", buffering=1)
    # print("Started English processing in", location, file=out)

    parser = argparse.ArgumentParser()

    # Required parameters - adapt to current directory
    parser.add_argument(
        "--data_dir",
        # default=None,
        default=location + "\\Skript\\outputs\\",
        type=str,
        # required=True,
        required=False,
        help="The input data dir. "
        "Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        # default=None,
        default="bert",
        type=str,
        # required=True,
        required=False,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )
    parser.add_argument(
        "--model_name_or_path",
        # default=None,
        # default="textattack/bert-base-uncased-MNLI",
        default=location + "\\Skript\\english\\seb-bert-mnli",
        type=str,
        # required=True,
        required=False,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="textattack/bert-base-uncased-MNLI",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--output_dir",
        # default=None,
        default=location + "\\Skript\\english\\seb-bert-mnli",
        type=str,
        # required=True,
        required=False,
        help="The output directory where checkpoints will be written.",
    )
    parser.add_argument(
        "--config_name",
        default=location + "\\Skript\\english\\seb-bert-mnli\\config.json",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        # default=128,
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        # "--do_test", action="store_true", help="Whether to run eval on the test set."
        "--do_test",
        action="store_false",
        help="Whether to run eval on the test set.",
    )
    parser.add_argument(
        # "--do_print_predictions", action="store_true", help="Whether to print the model predictions for manual inspection.",
        "--do_print_predictions",
        action="store_false",
        help="Whether to print the model predictions for manual inspection.",
    )
    parser.add_argument(
        "--do_lower_case",
        # action="store_true",
        action="store_false",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--overwrite_output_dir",
        # action="store_true",
        action="store_false",
        help="Overwrite the content of the output directory",
    )

    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir)
        )

    # Setup CPU processing
    device = torch.device("cpu")
    args.device = device

    # Setup logging
    # logging.basicConfig(
    #     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    #     datefmt="%m/%d/%Y %H:%M:%S",
    #     filename='log.txt',
    #     filemode='a',
    #     level=logging.INFO,
    # )
    # logger.warning(
    #     "Device: %s",
    #     device
    # )

    # Set seed to 42
    set_seed()

    processor = SemEvalProcessor()
    args.output_mode = "classification"
    label_list = processor.get_labels()
    num_labels = len(label_list)

    args.model_type = args.model_type.lower()
    # logger.info("Model %s", args.model_type)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    model.to(args.device)

    # logger.info("Training/evaluation parameters %s", args)

    # Evaluation
    results = {}
    if args.do_test:
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case,
        )
        checkpoints = [args.output_dir]
        # logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = str(global_step)

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items()
            )
            results.update(result)
    else:
        # use currently active model
        result = evaluate(args, model, tokenizer, prefix="test")
        # results.update(result)

    return results


# define a new data processor for the SemEval data/SAG task
class SemEvalProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
        )

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
        )

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
        )

    def get_labels(self):
        """See base class."""
        return ["correct", "incorrect", "NONE"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the test set."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = line[0]
            text_a = line[1]
            text_b = line[2]
            label = line[-1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            )
        return examples


# custom metrics for SAG: F1 and Accuracy
def my_compute_metrics(eval_task, preds, labels):
    result = {}
    if eval_task == "sag":
        acc = accuracy_score(y_pred=preds, y_true=labels)
        f1_weighted = f1_score(y_pred=preds, y_true=labels, average="weighted")
        f1_macro = f1_score(y_pred=preds, y_true=labels, average="macro")
        result = {"f1-weighted": f1_weighted, "f1-macro": f1_macro, "accuracy": acc}
    else:
        result = compute_metrics(eval_task, preds,
                                 labels)
    return result


def print_predictions(args, preds):
    # generate data set part of output path
    dir_name = ""

    # get examples
    processor = SemEvalProcessor()
    examples = processor.get_test_examples(args.data_dir)

    # observed grade list created
    obs_grade = [ex.label for ex in examples]
    # suggested grade list created
    sugg_grade = ['correct' if pred == 0 else 'incorrect' for pred in preds]

    # flag: do observed grades exist?
    count = 0

    # Check if obs_grade contains "NONE" values or is empty
    if not obs_grade or all(grade == 'NONE' for grade in obs_grade):
        count += 1
    else:
        # classification report
        classification_rep = classification_report(obs_grade, sugg_grade)
        report_string = classification_rep
        report_lines = report_string.split('\n')
        # print(report_lines)

        # accuracy line
        formatted_accuracy_line = "\t".join(report_lines[5].split())
        formatted_acc_line_with_tabs = (
            formatted_accuracy_line[
                :formatted_accuracy_line.index('\t', formatted_accuracy_line.index('\t'))
            ]
            + '\t\t'
            + formatted_accuracy_line[
                formatted_accuracy_line.index('\t', formatted_accuracy_line.index('\t')):
            ]
        )

        # weighted avg printing
        wt_avg_line = "\t".join(report_lines[7].split())
        new_wt_avg_line = wt_avg_line.replace("\t", " ", 1)

        # Join the entire newly formatted list into a single string
        formatted_output = "\n".join([
            "\t precision \t recall \t f1-score \t support",
            "\t".join(report_lines[2].split()),
            "\t".join(report_lines[3].split()),
            formatted_acc_line_with_tabs,
            new_wt_avg_line,
        ])

    with open(
        args.data_dir + "/" + dir_name + "/predictions.txt", "w", encoding="utf8"
    ) as writer:
        writer.write(
            "question\treferenceAnswer\tstudentAnswer\tsuggested grade\tobserved grade\n"
        )
        for i in range(len(examples)):
            # iterate over data
            # print prediction as a text-based label
            hrpred = "incorrect"
            if preds[i] == 0:
                hrpred = "correct"
            # get guid, text, from inputExample
            writer.write(
                str(examples[i].guid)
                + "\t"
                + examples[i].text_a
                + "\t"
                + examples[i].text_b
                + "\t"
                + hrpred
                + "\t"
                + examples[i].label
                + "\n"
            )
        if count == 1:
            writer.write(
                "\nClassification Report cannot be printed as observed grade column is empty or filled "
                "with 'NONE' or 'none' values\n"
            )
        else:
            # Write the classification report to the file
            writer.write(
                "\nClassification Report - high Precision for classes correct or incorrect indicates that the class prediction is reliable:\n"
            )
            writer.write(formatted_output)


if __name__ == "__main__":
    main()