#!/usr/bin/perl

use Getopt::Std; #for handling command line options

# Program : roc
# Purpose : To calculate the AUC (the area under the ROC curve).
#		The input file for this program is the output of the program "preproc.pl".
#	    Output: AUC value
#
#  	    You can also write out the detailed curve data (one TPR/FPR pair per prediction) by specifying an output file.
#           These data can be used in Microsoft Excel for drawing the ROC graph.
#
# Author  : Jumail Bin Taliba (jumail@utm.my)
# Date    : 20 September 2008
#
# Usage: 
#       ./roc.pl  -m model -f fold_count roc_infile [output_file]
# e.g. : 
#	./roc.pl roc_sample_input.txt roc_curve_data_outfile.txt
#	./roc.pl -f 13 -m inc roc_sample_input.txt

# Print $prompt (normally an error message) followed by the command-line help
# to STDERR, then terminate the program with a non-zero exit status so that
# calling shells and makefiles can detect the failure. Never returns.
#
#   prompt : text shown above the usage summary; may be omitted.
sub SyntaxUsage
{
	my ($prompt) = @_;
	$prompt = '' if (!defined $prompt);

	print STDERR ("\n\n".$prompt."\n");
	print STDERR "\n\nUsage:\n $0 [-m model] [-f fold_count] roc_infile [output_file]\n\n";
	print STDERR ("Options:\n\n");
	print STDERR ("-m\tmodel type = inc or dec (default: inc)\n");
	print STDERR ("-f\tnumber of cross validation folds (default: 1)\n");
	print STDERR ("\n");
	# The output file is a positional argument, not a -o switch.
	print STDERR ("output_file\toptional file that receives the data to draw the roc curve\n");
	print STDERR ("\n\n");
	print STDERR ("Each row in the input file has two columns.\n 1) Prediction value.\n 2) True class (i.e. +1 = positive and -1=negative).\n");
	print STDERR ("\n\n");
	print STDERR ("Example:\n");
	print STDERR ("$0 roc_sample_input.txt roc_curve_data_outfile.txt\n");
	print STDERR ("$0 -f 13 -m inc roc_sample_input.txt\n");
	print STDERR ("\n\n");
	exit(1);
}



# ---------------------------------------------------------------------------
# Main program: parse the command line, load the predictions, split them into
# consecutive folds, report AUC/ERR/RFP per fold and the means over all folds.
# ---------------------------------------------------------------------------

# -m and -f both take a value; the parsed values are stored in %opts.
my %opts;
getopt('mf', \%opts);

SyntaxUsage("roc.pl: To calcuate the mean ROC") if (scalar(@ARGV)==0);

# inc = a larger prediction value is better (e.g. pearson's correlation, mi, svm).
# dec = a smaller prediction value is better (e.g. rmsd).
# Anything other than 'dec' falls back to 'inc'.
my $model = lc(defined $opts{m} ? $opts{m} : '') eq 'dec' ? 'dec': 'inc';
my $fold_count = $opts{f} ? $opts{f} : 1;
my $roc_infile = shift @ARGV;
my $roc_graph_outfile= shift @ARGV;

# The fold count is used as a divisor and as a loop bound, so it has to be a
# positive whole number; reject anything else instead of dividing by zero later.
SyntaxUsage("*** The fold count '$fold_count' must be a positive whole number") if ($fold_count !~ /^[0-9]+$/ || $fold_count < 1);
SyntaxUsage("*** The roc input file '$roc_infile' doesn't exist") if (!$roc_infile || !-f $roc_infile);

my @prediction_list = LoadInputFile($roc_infile);

my $available_size = scalar(@prediction_list);
my $data_per_fold  = int($available_size / $fold_count);  # whole rows only; splice() would truncate anyway
my $unused_size    = $available_size - $fold_count*$data_per_fold;

if ($data_per_fold == 0)
{
	print STDERR "Warning: only $available_size prediction(s) available for $fold_count fold(s); every fold is empty\n";
}
elsif ($unused_size > 0)
{
	print STDERR "Warning: the last $unused_size prediction(s) do not fill a fold and are ignored\n";
}

unlink ($roc_graph_outfile) if ($roc_graph_outfile); # Firstly, remove the existing file, since we use appending mode for writing the file

my $sum_auc = 0;
my $sum_err = 0;
my $sum_rfp = 0;

for (my $i=0; $i<$fold_count; $i++)
{
	# Each fold consumes the next $data_per_fold rows of the input, in file order.
	my @this_prediction_list = splice(@prediction_list,0, $data_per_fold);
	my @roc_table = ();
	
	my ($auc,$err,$rfp) = CalculateAUC(\@this_prediction_list, \@roc_table);

	# Columns: fold name, AUC, ERR, RFP
	print 'Fold_'.($i+1)."\t".$auc."\t".$err."\t".$rfp."\n";
	SaveGraphData($roc_graph_outfile, 'Fold '.($i+1),\@roc_table) if ($roc_graph_outfile);

	$sum_auc += $auc;
	$sum_err += $err;
	$sum_rfp += $rfp;
}

print "\n";
print "Mean ROC\t".($sum_auc/$fold_count)."\n";
print "Mean ERR\t".($sum_err/$fold_count)."\n";
print "Mean RFP\t".($sum_rfp/$fold_count)."\n";

print "\n";

print STDERR "\n\nThe output file has been written into '$roc_graph_outfile'\n" if ($roc_graph_outfile);

# CalculateAUC(\@inlist, \@outlist)
#
# Calculates the area under the ROC curve for one set of predictions.
#
#   inlist  : reference to an array of [prediction_value, target_class] pairs,
#             where target_class is 1 for a positive and 0 for a negative.
#             The caller's array is not reordered.
#   outlist : reference to an array that is emptied and then filled with one
#             [TPR, FPR] pair per prediction, in ranked order.
#
# The ranking direction comes from the file-level $model: 'inc' puts the
# largest prediction value first, anything else puts the smallest first.
# Predictions with equal values are not grouped; each one is its own cutoff.
#
# Returns the list (auc, err, rfp):
#   auc : sum of the trapezoid areas under the (FPR, TPR) points.
#   err : (positives seen + negatives seen) / (total predictions), taken at
#         the first cutoff where |specificity - sensitivity| is smallest;
#         -1 when the input is empty.
#         NOTE(review): despite the name this is not a misclassification
#         rate -- confirm the intended definition.
#   rfp : number of negatives seen once the whole list has been walked.
#
# A non-empty input that holds only positives or only negatives has no defined
# TPR or FPR. The undefined rate is reported as 0 (giving an AUC of 0) and a
# warning is printed, instead of dying with a division by zero.

sub CalculateAUC
{
	my ($inlist, $outlist) = @_;
	
	@{$outlist} = ();
	
	# sort() builds a new list, so the caller's array keeps its order.
	my @list = sort {$a->[0] <=> $b->[0]} @{$inlist};
	@list = reverse @list if ($model eq 'inc');
	
	# Class totals: every row is either a positive (1) or a negative (0).
	my $list_count = scalar(@list);
	my $tp_total = 0;
	$tp_total += $_->[1] foreach (@list);
	my $tn_total = $list_count - $tp_total;
	
	if ($list_count > 0 && ($tp_total == 0 || $tn_total == 0))
	{
		warn "CalculateAUC: the data contains only one class; the AUC is undefined and reported as 0\n";
	}
	
	my $sum_area = 0; # running AUC
	my $err = -1;     # stays -1 when there is no data
	my $tp_count = 0; # positives seen up to the current cutoff
	my $tn_count = 0; # negatives seen up to the current cutoff
	
	my $min_dist_sens_spec = 10; # larger than any possible |spec - sens|
	
	my $prev_tpr;
	my $prev_fpr;
	
	for my $i (0 .. $list_count-1)
	{
		my $tp = $list[$i][1];
		my $tn = $tp == 0 ? 1 : 0;
		
		$tp_count += $tp;
		$tn_count += $tn;
		
		# Guard the divisions: a missing class makes its rate undefined.
		my $tpr = $tp_total ? $tp_count/$tp_total : 0;
		my $fpr = $tn_total ? $tn_count/$tn_total : 0;
		
		# Remember the cutoff where sensitivity and specificity are closest.
		my $sens_spec = abs((1-$fpr) - $tpr);
		
		if ($sens_spec < $min_dist_sens_spec)
		{
			$err = ($tp_count + $tn_count)/($tp_total+$tn_total);
			$min_dist_sens_spec = $sens_spec;
		}
		
		# Trapezoid between the previous and the current point
		# (x-axis = FPR, y-axis = TPR):
		#     area = (x2-x1) * (y1 + (y2-y1)/2)
		# The first point has no predecessor and contributes nothing.
		my $area = 0;
		
		if ($i > 0)
		{
			$area = ($fpr - $prev_fpr) * ($prev_tpr + ($tpr-$prev_tpr)/2 );
		}
		
		$sum_area += $area;
		
		push(@{$outlist}, [$tpr,$fpr]);
		
		$prev_tpr = $tpr;
		$prev_fpr = $fpr;
	}
	
	return ($sum_area,$err,$tn_count);
}

# Return a copy of the given text with every carriage return and line feed
# removed, so that DOS (\r\n), classic Mac (\r) and Unix (\n) line endings are
# all handled the same way. The caller's variable is not modified.
sub TrimNewline
{
	my ($text) = @_;
	
	# Delete both end-of-line characters in a single pass.
	$text =~ tr/\r\n//d;
	
	return $text;
}

# LoadInputFile($input_file)
#
# Loads the roc input file and returns the list of predictions, each one an
# array reference [prediction_value, target_class].
#
# Sample input file (two whitespace-separated columns):
#
# 0.863553243713228	-1
# 0.794411903589391	-1
# 0.893361391565565	+1
# 0.942820799124522	+1
# 0.90623139139468	-1
#
# The class symbol is mapped for the calculation: '-1' becomes 0 (negative),
# anything else becomes 1 (positive).
#
# Blank and whitespace-only lines are skipped. Leading whitespace on a line is
# ignored. A line without a class column cannot be scored, so it is skipped
# with a warning rather than being counted as a positive.
#
# Dies when the file cannot be opened.

sub LoadInputFile
{
	my ($input_file) = @_;
	my @list = ();
	
	# Three-argument open: the file name can never be mistaken for an open mode.
	open(my $fin, '<', $input_file) or die ("Unable to open file '$input_file' for reading: $!");
	
	while (defined (my $line = <$fin>))
	{
		$line = TrimNewline($line);
		
		# split ' ' discards leading whitespace, so an indented row still
		# yields the prediction in the first field.
		my ($prediction, $target_class) = split(' ', $line);
		
		next if (!defined $prediction); # blank or whitespace-only line
		
		if (!defined $target_class)
		{
			print STDERR "Warning: '$input_file' line $. has no class column; line skipped\n";
			next;
		}
		
		$target_class = $target_class eq '-1' ? 0:1;  # In file: TP=+1, TN=-1. In calculation: TP=1, TN=0
		
		push(@list, [$prediction, $target_class]);
	}
	
	close($fin);

	return @list;
	
}


# SaveGraphData($output_file, $title, \@list)
#
# Appends one table of curve data to $output_file so that it can later be used
# in Microsoft Excel for drawing the ROC curve.
#
#   output_file : path of the file to append to (created when missing).
#   title       : heading line written above the table.
#   list        : reference to an array of array references; each inner array
#                 is written as one tab-separated row under the "TPR FPR" header.
#
# Dies when the file cannot be opened or when the buffered data cannot be
# flushed at close time.

sub SaveGraphData
{
	my ($output_file, $title,$list) = @_;
	
	# Three-argument open with a lexical handle: the path is never parsed for a
	# mode, and the handle cannot clash with another global filehandle.
	open(my $fout, '>>', $output_file) or die ("Unable to open file '$output_file' for writing: $!");
	
	print {$fout} "$title\n";
	print {$fout} "TPR\tFPR\n";
	print {$fout} join( "\t",@{$_} )."\n"  foreach (@{$list});
	print {$fout} "\n";
	
	# Write errors on a buffered handle only surface when it is closed.
	close($fout) or die ("Unable to finish writing file '$output_file': $!");
}
