#!/usr/bin/perl
use strict;

# Program : cross_dataset
# Purpose : To prepare testing and training dataset of the SVMLight for cross-validation expriments
# Author  : Jumail Bin Taliba (jumail@utm.my)
# Date    : 02 Nov 2008

# Usage: 
#        cross_dataset.pl  number_of_cross_exp   num_of_repeat  postv_sample_per_set   negtv_sample_per_set input_file output_dir
# e.g. : 
#	./cross_dataset.pl  4 25 500 500  all_samples.dat svm_dataset
#	./cross_dataset.pl  4 25 -1 -1  all_samples.dat svm_dataset



my ($num_cross_exp, $num_repeat, 
    $positive_per_fold, $negative_per_fold,
    $input_file, 
    $output_dir ) = @ARGV;

my @positive_samples = (); # to store the order of pairs read. access. $positive_samples[0] =[svm_line_text,line_number,class,random_value];
my @negative_samples = ();

my @selected_positives = (); # to be used for creating testing and training set
my @selected_negatives = (); # to be used for creating testing and training set

die("Input file doesn't exist: $input_file") if (!$input_file || !-e $input_file);

system ("mkdir $output_dir") if ($output_dir && ! -d $output_dir);

Prompt('Loading input file ....');
LoadInputFile($input_file);
Prompt("Done\n");

my $available_positive_sample = @positive_samples;
my $available_negative_sample = @negative_samples;

$positive_per_fold = int($available_positive_sample/$num_cross_exp) if ( $positive_per_fold <=0 );
$negative_per_fold = int($available_negative_sample/$num_cross_exp) if ( $negative_per_fold <=0 );

my $required_positive_sample = $num_cross_exp * $positive_per_fold;
my $required_negative_sample = $num_cross_exp * $negative_per_fold;

die("Invalid required sample") if (!$required_positive_sample || !$required_negative_sample);

die("Required positive pair sample exceeds the available: $required_positive_sample > $available_positive_sample") if ($required_positive_sample > $available_positive_sample);
die("Required negative pair sample exceeds the available: $required_negative_sample > $available_negative_sample") if ($required_negative_sample > $available_negative_sample);

my $number = 1; # expriment number

for (my $repeat=0; $repeat<$num_repeat; $repeat++)
{
	@selected_positives = SelectRandomSamples(\@positive_samples,$required_positive_sample);
	@selected_negatives = SelectRandomSamples(\@negative_samples, $required_negative_sample);
	
	my @cross_positive_dataset = (); # access => $cross_dataset[0] = [pair list]
	my @cross_negative_dataset = ();
	
	# split the selected pairs into their cross exp
	
	for (my $cross=0; $cross < $num_cross_exp; $cross++)
	{
		push(@{$cross_positive_dataset[$cross]}, ( splice(@selected_positives,0, $positive_per_fold) ) );
		push(@{$cross_negative_dataset[$cross]}, ( splice(@selected_negatives,0, $negative_per_fold) ) );
	}
	
	# rotate the cross dataset (1 set for testing, the rest for training)
	
	for (my $i= 0; $i < $num_cross_exp; $i++)
	{
		my @testing_list = ();
		my @training_list = ();
		
		push(@testing_list, @{$cross_positive_dataset[$i]}); # 1 set is for testing
		push(@testing_list, @{$cross_negative_dataset[$i]}); # 1 set is for testing
		
		for (my $j= 0; $j< $num_cross_exp; $j++) # positive sample
		{
			next if ($j==$i); # skip if the dataset has already been taken as testing set
			push(@training_list, @{$cross_positive_dataset[$j]}); # 1 sample is for testing
			push(@training_list, @{$cross_negative_dataset[$j]}); # 1 sample is for training
			
		}
		
		my $print_number = "$number";
		#$print_number ='0'.$print_number if ($number<10);
		#$print_number =('0' x (3-length($print_number)) ).$number;
		
  		my $expriment_file  = $output_dir.'/expriment'.$print_number.'.log';
  		my $training_file  = $output_dir.'/train'.$print_number.'.svm';
  		my $testing_file  = $output_dir.'/test'.$print_number.'.svm';

  		Prompt("Writing expriment file: $expriment_file");
  		SaveExprimentInfoToFile($expriment_file,$testing_file, $training_file, \@testing_list, \@training_list);
  
  		my $total_exp = $num_cross_exp*$num_repeat;
  		
  		print "\nWriting testing data file: $testing_file\n";
  		
  		SaveSamplesToFile($testing_file,\@testing_list, "Testing: Expriment #$number of $total_exp");
  
  		Prompt("Writing traning data file: $training_file");
  		SaveSamplesToFile($training_file,\@training_list, "Training: Expriment #$number of $total_exp");
		
		$number++;	
	}
}


# To selet n number of samples randomly

sub SelectRandomSamples
{
	my ($ref_list, $n) = @_;
	my @result = ();
	
	# firstly, assign a random number to each pair
	
	my $count = @{$ref_list};
	
	for (my $i=0; $i<$count; $i++)
	{
		$ref_list->[$i][3] = rand();
	}
	
	# then sort the list according the assigned number
	@result = sort {$a->[3] <=> $b->[3]} @{$ref_list};

	# finally take the first n samples from the sorted list

	@result = splice(@result, 0, $n);
	
	return @result;
}

sub LoadInputFile
{
	my ($infile) = @_; # The input file is in SVMLight input format
	
	open(fin, "<$infile")   || die ('**Error: Unable to open input file');
		
	my $line;
	my $line_number = 0;
	
	while ($line = <fin> )
	{
		$line_number ++;
		$line = TrimNewline($line);
		
		my ($target) = split(/\s+/, $line); # take the class, i.e. from the first column
		
		($target eq -1) ? push(@negative_samples,[$line,$line_number,-1,0]): push(@positive_samples,[$line,$line_number,1,0] )
	}
	close(fin);
}


sub WriteDatasetInfo
{
	local *fout = shift;
	my $ref_list = shift;

	my $count = @{$ref_list};
	
	print fout "Sample\t$count\n\n";

	for (my $j=0; $j<$count; $j++)
	{
		my $sample_id = $ref_list->[$j][1]; 
		my $class = $ref_list->[$j][2] eq -1  ? '-1' :'+1';

		my $seq = $j+1;

		print fout "$seq\.\t$sample_id\t$class\n";
	}
	
}

sub SaveExprimentInfoToFile
{
	my ($output_file, $test_file,$train_file, $ref_testing, $ref_training) = @_;
	open(fout, ">$output_file") || die ("Unable to open file '$output_file' for writing");

	#Writing Testing Dataset Info

	print fout "[Testing Dataset]\n\n";
	print fout "File\t$test_file\n\n";

	WriteDatasetInfo(*fout, $ref_testing);
	
	#Writing Training Dataset Info

	print fout "\n[Training Dataset]\n";
	print fout "File\t$train_file\n\n";

	WriteDatasetInfo(*fout, $ref_training);

	close(fout);
}

sub SaveSamplesToFile
{
	my ($output_file,$ref_list, $prompt) = @_;
	open(fout, ">$output_file") || die ("Unable to open file '$output_file' for writing");

	my $list_count = @{$ref_list};

	for (my $j=0; $j<$list_count; $j++)
	{
		my $line = $ref_list->[$j][0];

		my $no = $j+1;
		Prompt("\tSample #$no of $list_count:\t ($prompt)");
		print fout "$line\n";
	}
	close(fout);
}

sub TrimNewline
{
	my ($line) = @_;
	
	# To handle different types of file format created by different platforms
	$line =~ s/\r\n//g; # DOS
	$line =~ s/\r//g; # MAC
	$line =~ s/\n//g; # Unix
	
	return $line;
}

sub Now
{
	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
	$year += 1900;
	return "$mday-$mon-$year $hour:$min:$sec";
}

sub Prompt
{
	my ($msg, $print_time) = @_;
	
	print STDERR Now().':' if ($print_time);
	print STDERR $msg."\n";
}