#!/usr/bin/perl -w
use strict;
use warnings;
use Data::Dumper::Names;
use Data::Dump qw(dump);
#use Data::Dumper::Names;

################### This is the first Perl file to be run. Input: see below. Output: an .arff file, to be used with the WEKA software, that contains the computed features for each text fragment.
################### To run this code correctly, check that the libraries Text::Ngram, List::Util and Encode are installed in your Perl directory (and possibly others: if the code reports an unknown function, check the module named after each 'use').
################### The libraries we created ourselves (the folder InAraCorpus) should be copied to your_perl_directory\site\lib\InAraCorpusLib; otherwise the code will not work.
################### Configure the method as shown just below, in the comments of the call to the function Ngram_method_training_main_in_document_set.
# ---------------------------------------------------------------------------
# Run configuration. The file-level lexicals declared here are read directly
# by the subroutines defined further down in this file.
# ---------------------------------------------------------------------------
my $nn  = 6;    # length of the character n-grams
my $mm  = 4;    # number of n-gram frequency classes
my $num = 1;    # corpus part number (used when the corpus is divided into parts
                # that are processed by several Perl scripts launched together)

# Options handed to the sliding-window segmenter.
# NOTE(review): presumably each entry maps a size threshold to
# [window size, step] -- confirm against PAN2011CorpusLib::PANSlidingWindow.
my %sliding_windows_parameters = (
    0    => [ 100, 25 ],
    600  => [ 200, 50 ],
    1800 => [ 400, 100 ],
);

# Directory of the per-document segment dumps; used when $segmentation_source
# is 'segment&store' or 'retrieve_stored_segments'.
my $segments_dumped_directory = 'F:\Segments_dumps\Win600-1800\segments';

# Where the segments come from. Possible values:
#   'segment_only'
#   'retrieve_stored_segments'
#   'segment&store'
#   'segmentation_from_stored_ngrams'
my $segmentation_source = 'segment_only';

# Directory of the per-document n-gram dumps; used when $ngrams_source is
# 'ngramer&store' or 'retrieve_stored_ngrams'.
# Alternative base directory used previously: 'C:\Users\Pc\Desktop'
my $ngrams_dumped_directory = join( '\\',
    'F:\Imene_doc\Doctorat\Implementation\segments_ngrams_dumps',
    $nn . 'gram' );

# Where the n-grams come from. Possible values:
#   'ngramer_only'
#   'retrieve_stored_ngrams'
#   'ngramer&store'
my $ngrams_source = 'ngramer_only';

# Launch the processing of the whole document set. Arguments, in order:
#   1. directory of the suspicious documents
#   2. directory of the annotations of the suspicious documents
#   3. proposed name of the generated features dataset; the n-gram length and
#      the number of classes are prepended to form the full file name
#   4. directory where the results are saved
#   5. n: length of the n-grams
#   6. m: number of n-gram frequency classes
Ngram_method_training_main_in_document_set(
    'C:\Corpora\testing\diveded\suspicious_doc-part' . $num,
    'C:\Corpora\testing\all\pan11-plagiarism-intrinsic-competition-collection-annotations',
    'PAN11TestSliWin3sizes_' . $num . '.arff',
    join( '\\',
        'F:\Imene_doc\Doctorat\Implementation\Training_sets-gram-frequency-class-method\onPAN',
        $nn . 'g' . $mm . 'cWin' ),
    $nn,
    $mm,
);


sub Ngram_method_training_main_in_document_set  #OK
{   ####### This function is language independent
    #
    # Builds the features dataset (.arff) for every '.txt' document found in a
    # directory of suspicious documents.
    #
    # Arguments (positional):
    #   0  directory that contains the suspicious documents (*.txt)
    #   1  directory that contains the annotation files (same base name, .xml)
    #   2  proposed name of the features file; it is prefixed with
    #      "<n>gram-<classes>classes-" to form the final file name
    #   3  directory where the features file and the parameter file are saved
    #      (created when missing; only the last path component is created)
    #   4  n: length of the n-grams
    #   5  number of n-gram frequency classes
    #
    # Reads the file-level settings $segmentation_source, $ngrams_source,
    # $segments_dumped_directory, $ngrams_dumped_directory and
    # %sliding_windows_parameters. Dies when a directory or a file cannot be
    # created or opened.

    use strict;
    use warnings;
    use PAN2011CorpusLib::PANEvaluation;

    my $start1 = time;

    my ( $suspicious_doc_directory_path, $annotation_directory_path,
         $features_file_name, $features_file_path, $n, $number_of_classes ) = @_;

    unless (-e $features_file_path or mkdir $features_file_path)
        {   die "Unable to create $features_file_path: $!\n";}

    ####### create a directory to save the segments for each file
    if ($segmentation_source eq 'segment&store')
    {   unless (-e $segments_dumped_directory or mkdir $segments_dumped_directory)
        {   die "Unable to create $segments_dumped_directory: $!\n";}
    }

    ####### create a directory to save the ngrams for each file
    if ($ngrams_source eq 'ngramer&store')
    {   unless (-e $ngrams_dumped_directory or mkdir $ngrams_dumped_directory)
        {   die "Unable to create $ngrams_dumped_directory: $!\n";}
    }

    $features_file_name = $features_file_path.'\\'.$n.'gram-'.$number_of_classes.'classes-'.$features_file_name;

    ########## Print the parameters in a file
    my $para_file_path = $features_file_path.'\\'.'method_prameters.txt';
    open my $prameters_file , ">", $para_file_path
        or die "impossible to create $para_file_path: $!";
    print $prameters_file Dumper(  $suspicious_doc_directory_path, $annotation_directory_path, $features_file_name, $n,
                                   $number_of_classes, \%sliding_windows_parameters , $segments_dumped_directory , $segmentation_source,
                                   $ngrams_dumped_directory,$ngrams_source);
    # Buffered write errors only surface when the handle is closed.
    close $prameters_file or die "impossible to write $para_file_path: $!";
    #############################################

    # Description of the columns to print. Each entry is
    # [parent key (or 1 for a top-level key), key / attribute name, ARFF type].
    my @features_to_print;
    push @features_to_print, [1,'offset','NUMERIC'];
    push @features_to_print, [1,'length','NUMERIC'];
    for (my $i=0; $i < $number_of_classes ;$i++)
    {   push @features_to_print, ['style_model', $i,'NUMERIC']; }
    push @features_to_print,[1,'plagiarism','{0,1}'];

    create_arff_file_head($features_file_name,\@features_to_print);

    # Lexical directory handle (instead of the bareword DIR) that is closed
    # explicitly once every document has been processed.
    opendir(my $documents_directory, $suspicious_doc_directory_path)
        or die "can't opendir $suspicious_doc_directory_path: $!";
    while (defined(my $file_name = readdir($documents_directory)))
    {   next unless ($file_name =~ m/\.txt$/);
        my $suspicious_doc_path  = $suspicious_doc_directory_path.'\\'.$file_name;
        my $annotation_file_path = $annotation_directory_path.'\\'.$file_name;
        my $segments_dump_file   = $segments_dumped_directory.'\\'.$file_name;
        my $ngrams_dump_file     = $ngrams_dumped_directory.'\\'.$file_name;

        # The annotation file has the same base name with an .xml extension.
        $annotation_file_path =~ s/\.txt$/.xml/;

        my %plagiarism_detection_data   =   (   'suspicious_document'  => $suspicious_doc_path,
                                                'metadata_file'        => $annotation_file_path,
                                                'dumped_segments_file' => $segments_dump_file,
                                                'dumped_ngrams_file'   => $ngrams_dump_file,
                                            );
        print "$file_name\n";
        my @document_segments_model = Ngram_method_training_main_in_one_file(\%plagiarism_detection_data, $n, $number_of_classes);

        ### print_hashs_list_in_file appends the chosen features to the features file.
        ### Its arguments are respectively:
        ###   list of hashes which have the same keys;
        ###   description of the keys to print (values of the same key go to the same column);
        ###   full path of the file to append to;
        ###   name of the document the segments come from.
        print_hashs_list_in_file(\@document_segments_model, \@features_to_print, $features_file_name, $file_name );
    }
    closedir($documents_directory);

    my $duration1 = time-$start1;
    my $duration_min1 = sprintf("%d",$duration1/60);
    print "\nRunning duration In all the corpus($duration1 sec):\t",$duration_min1," min ",$duration1 % 60," sec","\n";
    # $features_file_name holds the full path of the generated file.
    print "Processing completed: The file $features_file_name is generated.\n";

}




sub Ngram_method_training_main_in_one_file #I adapt the segmentation to receive an argument on language and substitute the segmentation package of InAra by the one of PAN
{   # Computes the annotated feature model of the segments of ONE document.
    #
    # Arguments (positional):
    #   0  reference to a hash of paths with the keys 'suspicious_document',
    #      'metadata_file', 'dumped_segments_file' and 'dumped_ngrams_file'
    #   1  n: length of the n-grams
    #   2  number of n-gram frequency classes
    #
    # Returns the list of segments (hash references) as returned by
    # plagiarism_annotation_of_segments_list.
    #
    # Reads the file-level settings $segmentation_source and
    # %sliding_windows_parameters. Dies when the segments cannot be stored or
    # retrieved.

    use strict;
    use warnings;
    use PAN2011CorpusLib::PANTextSegmentation;
    use PAN2011CorpusLib::PANEvaluation;
    #use Data::Dumper;
    use FileOperation::PANMetadataFiles;
    use PAN2011CorpusLib::PANSlidingWindow;
    use Storable;
    use IO::Handle;    # provides STDOUT->autoflush on every perl version

    my %file_paths_setting = %{$_[0]}; #suspicious_document
    my $n = $_[1];
    my $number_of_classes = $_[2];

    my @suspicious_document_segments;
    my $segments_dump_file = $file_paths_setting{'dumped_segments_file'};

    ####### Segmentation
    if ( $segmentation_source eq 'segment&store' or $segmentation_source eq 'segment_only' )
    {
        @suspicious_document_segments = create_segments_via_words_sliding_window_from_file(  $file_paths_setting{'suspicious_document'},
                                                                                              \%sliding_windows_parameters, # a hash that contains options of win size, step
                                                                                              1   # include text
                                                                                           );
        if ( $segmentation_source eq 'segment&store' )
        {   # store returns undef on I/O problems: do not continue silently
            # with a dump that was not written.
            store(\@suspicious_document_segments, $segments_dump_file)
                or die "Unable to store the segments in $segments_dump_file: $!\n";
        }
    }
    elsif ( $segmentation_source eq 'retrieve_stored_segments' )
    {   # retrieve returns undef on I/O problems; report the file instead of
        # failing later on an undefined array reference.
        my $stored_segments = retrieve($segments_dump_file);
        defined $stored_segments
            or die "Unable to retrieve the segments from $segments_dump_file: $!\n";
        @suspicious_document_segments = @{$stored_segments};
    }
    # Any other value (e.g. 'segmentation_from_stored_ngrams') leaves the list
    # empty at this point; ngram_frequency_model_extraction may fill it from
    # the stored n-grams.

    ####### feature computing
    ngram_frequency_model_extraction(\@suspicious_document_segments,$n, \%file_paths_setting, $number_of_classes);

    ######## segment annotation with real plagiarism to use the resulting dataset in training
    # According to the original author's note, plagiarism_annotation_of_segments_list
    # annotates each segment with a value between 0 and 1 representing the ratio of
    # plagiarism length to the segment length. Its arguments are a reference to the
    # segments list and the path of the xml annotation file.
    @suspicious_document_segments = plagiarism_annotation_of_segments_list(\@suspicious_document_segments,$file_paths_setting{'metadata_file'});#*

    STDOUT->autoflush(1);
    return @suspicious_document_segments;
}

sub ngram_frequency_model_extraction #OK I check all the function that comprise
{   # Replaces, in place, the content of every segment by its style model.
    #
    # Arguments (positional):
    #   0  reference to the array of segments (hash references)
    #   1  n: length of the n-grams
    #   2  reference to the hash of paths (the key 'dumped_ngrams_file' is used)
    #   3  number of n-gram frequency classes
    #
    # Reads the file-level setting $ngrams_source.

    use strict;
    use warnings;
    use Storable;

    my ($suspicious_document_segments, $n, $file_paths_setting, $number_of_class) = @_;
    STDOUT->autoflush(1);

    ############# n-grams with their frequencies for each segment of the document
    if ( $ngrams_source eq 'retrieve_stored_ngrams' )
    {   # Reload previously dumped n-grams into the caller's array.
        my $stored_ngrams = retrieve($file_paths_setting->{'dumped_ngrams_file'});
        @{$suspicious_document_segments} = @{$stored_ngrams};
    }
    elsif ( $ngrams_source eq 'ngramer&store' || $ngrams_source eq 'ngramer_only' )
    {   # Compute the n-grams now and, when requested, dump them for later runs.
        string_list_character_ngramer($suspicious_document_segments, $n);
        store($suspicious_document_segments, $file_paths_setting->{'dumped_ngrams_file'})
            if $ngrams_source eq 'ngramer&store';
    }

    ############# Document model: every n-gram of the document with its frequency class,
    ############# each n-gram being counted once per segment
    my %document_model = document_ngramFrequencyOverSentences_log_model(
        $suspicious_document_segments,
        $number_of_class,
        'once_per_segment',
    );

    ############# Model of each segment, derived from the document model
    string_list_from_ngram_model_to_class_model($suspicious_document_segments, \%document_model);
}


sub string_character_ngramer # ok just I modify the lowercase configuration in ngram_counts and make it flexible in encoded data
{   # Counts the character n-grams of a string.
    #
    # Arguments (positional):
    #   0  the text
    #   1  n: length of the n-grams
    #
    # Returns a hash (as a list) of n-gram => count, as produced by
    # Text::Ngram::ngram_counts.

    use strict;
    use warnings;
    use Text::Ngram qw(ngram_counts add_to_counts);
    use Encode;

    my ($string, $n) = @_;

    # Digits are removed before the n-grams are counted.
    $string =~ tr/0-9//d;

    # A byte string is decoded so that the n-grams are built from characters.
    $string = decode('utf-8', $string, Encode::FB_WARN)
        unless Encode::is_utf8($string);

    # lowercase => 1: all letters are lowercased before counting n-grams.
    my %ngramer_options = (
        flankbreaks => 0,
        lowercase   => 1,
        punctuation => 1,
        spaces      => 1,
    );
    my $ngram_counts = ngram_counts(\%ngramer_options, $string, $n);

    return %{$ngram_counts};   # ngrams are decoded in utf8
}

sub string_list_character_ngramer # ok no modification
{   ############### OUTPUT and INPUT of this subroutine are decoded to utf8
    # For every element of the list, stores the n-gram counts of its 'segment'
    # text under the key 'ngrams' and then removes the 'segment' text.
    #
    # Arguments (positional):
    #   0  reference to an array of hash references (modified in place)
    #   1  n: length of the n-grams

    use strict;
    use warnings;

    my ($string_list, $n) = @_;

    for my $entry (@{$string_list})
    {   $entry->{'ngrams'} = +{ string_character_ngramer($entry->{'segment'}, $n) };
        delete $entry->{'segment'};
    }
}


sub document_ngramFrequencyOverSentences_log_model  # ok
{   ##### INPUT: 1) a reference to an array where each element is a hash that has the key
    #####           'ngrams' : a hash with the ngrams of the segment as keys and their number of occurrences in the segment as values
    #####        2) the number of classes in which we would like to classify the document ngrams
    #####        3) the frequency mode; it can take one of two values:
    #####           'full'             : the occurrences of an ngram are summed over all the segments (repetitions inside a segment are counted;
    #####                                if segments overlap, the total is greater than the real number of occurrences in the document)
    #####           'once_per_segment' : an ngram is counted once per segment in which it appears
    ##### OUTPUT: a hash where each key is an ngram and each value is a hash with the key 'class'.
    #####         Class 0 is the class of the least frequent ngrams; the most frequent ngram gets the class number_of_classes-1.
    #####         An empty hash is returned when there is no ngram at all.
    ##### Every ngram is put in class 0 when all ngrams occur once, or when fewer than 2 classes are requested
    ##### (the latter used to end in a division by zero).
    ##### An unknown frequency mode is reported with a warning and yields an empty hash.

    use strict;
    use warnings;
    use List::Util qw(max);

    my ($segments_list, $number_of_classes, $frequency_mode) = @_;

    ####### count the ngrams over the segments
    my %occurence_of;
    if ($frequency_mode eq 'full')
    {   foreach my $segment (@{$segments_list})
        {   foreach my $ngram (keys %{$$segment{'ngrams'}})
            {   $occurence_of{$ngram} += $$segment{'ngrams'}{$ngram}; }
        }
    }
    elsif ($frequency_mode eq 'once_per_segment')
    {   foreach my $segment (@{$segments_list})
        {   foreach my $ngram (keys %{$$segment{'ngrams'}})
            {   $occurence_of{$ngram}++; }
        }
    }
    else
    {   warn "document_ngramFrequencyOverSentences_log_model: unknown frequency mode '$frequency_mode'\n"; }

    # Nothing to classify: avoids calling max on an empty list.
    return () unless %occurence_of;

    my $max_occurence = max values %occurence_of; # get the maximum occurrence

    my %document_ngram_hash;
    if ($max_occurence == 1 or $number_of_classes < 2)
    {   # Either all the ngrams appear only once, or only one class exists.
        foreach my $ngram (keys %occurence_of)
        {   $document_ngram_hash{$ngram} = { 'class' => 0 }; }
    }
    else
    {   # The base is the (number_of_classes-1)th root of the maximum, so that
        # the logarithm of the maximum in this base is number_of_classes-1.
        my $log_base = $max_occurence**(1/($number_of_classes-1));
        foreach my $ngram (keys %occurence_of)
        {   # logarithm of the count in base $log_base, rounded to the nearest integer
            my $class = sprintf( "%0.0f", log($occurence_of{$ngram})/log($log_base) );
            $document_ngram_hash{$ngram} = { 'class' => $class };
        }
    }
    return %document_ngram_hash;
}

sub log_value_in_base  #### from Perl Cookbook ###OK no modification
{   # Returns the logarithm of a value in an arbitrary base.
    # Arguments (positional): 0 the value, 1 the base.
    my $value = shift;
    my $base  = shift;

    # Change of base: log_b(x) = ln(x) / ln(b)
    my $natural_log_of_value = log $value;
    my $natural_log_of_base  = log $base;
    return $natural_log_of_value / $natural_log_of_base;
}

sub discard_short_segment # Modification in this version in comparison with the InAra version: the InAra package was replaced by the PAN package, whose tokenisation also works for Arabic and is the latest version
{   ###### Removes, in place, the segments that are shorter than a number of words.
    ###### Original author's note: this function does not consider an acronym as one word;
    ###### it is split into separate letters, each of which is considered a word.
    ######
    ###### Arguments (positional):
    ######   0  address of an array of hashes that contain decoded segments (key 'segment')
    ######   1  minimum number of words of a segment; shorter segments are deleted.
    ######      Optional: defaults to 4, the threshold that used to be hard-coded
    ######      (the argument was previously ignored).

    use strict;
    use warnings;
    use PAN2011CorpusLib::PANTextSegmentation;

    my ($list_of_segments, $length_of_segment_to_discard) = @_;
    $length_of_segment_to_discard = 4 unless defined $length_of_segment_to_discard;

    # Keep only the segments that are long enough; the caller's array is updated in place.
    @{$list_of_segments} =
        grep { number_of_words_in_one_segment($$_{'segment'}) >= $length_of_segment_to_discard }
        @{$list_of_segments};

    return;
}

sub segment_Ngram_DocFrequency_model ## OK no modification
{   # Tags, in place, every n-gram of a segment with its class in the document.
    #
    # Arguments (positional):
    #   0  reference to a hash ngram => number of occurrences in the segment
    #   1  reference to the document model: ngram => { 'class' => ... }
    #
    # After the call, each value of the first hash is a hash with the keys
    # 'nb_occ_in_seg' (the former value) and 'class' (taken from the document model).

    use strict;
    use warnings;

    my ($segment_ngrams, $document_ngrams) = @_;

    for my $gram (keys %{$segment_ngrams})
    {   $segment_ngrams->{$gram} = {
            'nb_occ_in_seg' => $segment_ngrams->{$gram},            # occurrences in the segment
            'class'         => $document_ngrams->{$gram}{'class'},  # frequency range in the document
        };
    }

}

sub set_of_segment_Ngram_DocFrequency_model  # ok no modification
{   # Applies segment_Ngram_DocFrequency_model to the 'ngrams' of every segment.
    #
    # Arguments (positional):
    #   0  reference to an array of segments (hashes with the key 'ngrams')
    #   1  address of the document model

    use strict;
    use warnings;

    my ($segments_list_with_ngrams, $document_model) = @_;

    for my $current_segment (@{$segments_list_with_ngrams})
    {   # tag the ngrams of the segment with their class in the document
        segment_Ngram_DocFrequency_model($current_segment->{'ngrams'}, $document_model);
    }

}



sub log_freqency_rang  ##### no call to this function in the file
{   # Looks a frequency up in the table 1 => 1, 2 => 2, ... built for the
    # integers that are lower than the number of ranges.
    # Arguments (positional): 0 the frequency, 1 the number of ranges.
    # Returns the table entry, or undef when the frequency is not a key of the table.
    my ($frequency, $nbr_of_ranges) = @_;

    my %range_of;
    my $range = 1;
    while ($range < $nbr_of_ranges)
    {   $range_of{$range} = $range;
        $range++;
    }
    # Labels once considered: 0 'infrequent', 1 'less_frequent', 2 'frequent', 3 'most_frequent'
    return $range_of{$frequency};
}


sub string_ClassNgrams_model # OK I think no modification is needed; not sure about the encoding of ngrams, to be checked when running
{   ##### input : a reference to a hash where keys are the ngrams of a string and values are hashes with the 2 keys 'class' and 'nb_occ_in_seg'
    ##### output: a hash where keys are classes and values are hashes with ngrams as keys and their number of occurrences in the string as values
    use strict;
    use warnings;

    my ($string_ngramFrequency_model) = @_;

    my %ngrams_of_class;
    for my $gram (keys %{$string_ngramFrequency_model})
    {   my $class_of_gram = $string_ngramFrequency_model->{$gram}{'class'};
        my $count_in_seg  = $string_ngramFrequency_model->{$gram}{'nb_occ_in_seg'};
        $ngrams_of_class{$class_of_gram}{$gram} = $count_in_seg;
    }
    return %ngrams_of_class;
}


sub string_ClassFrequency_model # OK I think no modification is needed; not sure about the encoding of ngrams, to be checked when running
{   # Converts, in place, a class model of a segment into percentages.
    #
    # Argument:
    #   0  reference to a hash class => { ngram => number of occurrences in the segment }
    #
    # After the call, the value of each class is the share (in percent, formatted
    # with 2 decimals) of the ngram occurrences of the segment that belong to the class.
    # When the occurrences sum to 0, every class gets "0.00" instead of a division by zero.

    use strict;
    use warnings;

    my $segment = $_[0];
    my $ngram_nbr_in_segment = 0;

    ####### compute the number of ngrams in each class
    foreach my $cls (keys %{$segment})
    {   my $ngrams_nbr_in_class = 0;
        # the values are the numbers of occurrences of the ngrams (repetitions are counted)
        $ngrams_nbr_in_class += $_ foreach values %{$$segment{$cls}};
        $$segment{$cls} = $ngrams_nbr_in_class;
        $ngram_nbr_in_segment += $ngrams_nbr_in_class;
    }

    ####### replace the number of ngrams of each class by a value normalized over the total number of ngrams
    foreach my $cls (keys %{$segment})
    {   my $percentage = $ngram_nbr_in_segment
                       ? $$segment{$cls}/$ngram_nbr_in_segment * 100
                       : 0;
        $$segment{$cls} = sprintf("%0.2f",$percentage);
    }

    return;
}


sub print_hashs_list_in_file  #OK
{   # Appends one line per segment to the features file.
    #
    # Arguments (positional):
    #   0  reference to a list of hashes which have the same keys (the segments)
    #   1  reference to the list of the keys to print; each entry is
    #      [1, key, ...] for a top-level key, or [parent key, sub key, ...] for a
    #      key of a nested hash. Values of the same key are printed in the same column.
    #   2  full path of the file to append to
    #   3  name of the document the segments come from; the document number
    #      obtained from it with get_file_name_from_path is printed in the first column
    #
    # A missing key is printed as 0. Dies when the file cannot be opened or written.
    #
    # NOTE(review): every value, including the last one, is followed by the
    # separator, so each line ends with a trailing ','. The format is kept
    # unchanged here; confirm that the consumers of the file expect it.

    use strict;
    use warnings;
    use FileOperation::PathsAndNames; #  for get_file_name_from_path

    my ($hashs_list, $keys_to_print, $file_path, $file_name_whose_the_printed_model) = @_;
    my $separator = ','; # or ';'

    $file_name_whose_the_printed_model = get_file_name_from_path($file_name_whose_the_printed_model,'file-number');
    $file_name_whose_the_printed_model = sprintf("%d",$file_name_whose_the_printed_model);

    open my $training_set,">>",$file_path
        or die "Unable to open $file_path for appending: $!";

    foreach my $segment (@{$hashs_list})
    {   my @row = ($file_name_whose_the_printed_model);
        foreach my $features (@{$keys_to_print})
        {   my ($parent_feature, $feature_to_print) = @{$features};
            my $value = 0;
            if ($parent_feature eq 1)   # string comparison: the parent may be a key name
            {   $value = $$segment{$feature_to_print}
                    if exists $$segment{$feature_to_print};
            }
            else
            {   $value = $$segment{$parent_feature}{$feature_to_print}
                    if exists $$segment{$parent_feature}{$feature_to_print};
            }
            push @row, $value;
        }
        print $training_set join('', map { $_ . $separator } @row), "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $training_set or die "Unable to write $file_path: $!";
    return;
}

sub create_arff_file_head  #Ok
{   # Creates (or overwrites) the features file and writes the ARFF header in it.
    #
    # Arguments (positional):
    #   0  full path of the features file; it is also used as the relation name
    #   1  reference to the list of features; each entry is an array reference
    #      whose element 1 is the attribute name and element 2 the attribute type
    #
    # The header is made of the '@relation' line, the attribute 'document NUMERIC',
    # one '@attribute' line per feature and the '@data' line.
    # Dies when the file cannot be created or written.

    use strict;
    use warnings;

    my ($features_file_path, $feature_list) = @_;

    open my $training_set,">",$features_file_path
        or die "Unable to create $features_file_path: $!";
    print $training_set '@relation ',$features_file_path,"\n";
    print $training_set '@attribute ',"document NUMERIC\n";

    foreach my $feature_name ( @{$feature_list} )
    {
      print $training_set '@attribute ',$$feature_name[1],' ',$$feature_name[2],"\n";  # e.g. @attribute 4 NUMERIC
    }
    print $training_set '@data',"\n";

    # Buffered write errors only surface when the handle is closed.
    close $training_set or die "Unable to write $features_file_path: $!";
    return;
}


sub string_list_from_ngram_model_to_class_model
{   # Replaces, in place, the 'ngrams' of every segment by its 'style_model'.
    #
    # Arguments (positional):
    #   0  reference to the array of segments (hashes with the key 'ngrams')
    #   1  reference to the document model
    my ($suspicious_document_segments, $document_model) = @_;

    for my $current_segment (@{$suspicious_document_segments})
    {
        # tag every ngram of the segment with its class in the document
        segment_Ngram_DocFrequency_model($current_segment->{'ngrams'}, $document_model);

        # group the ngrams of the segment by class
        my %class_model = string_ClassNgrams_model($current_segment->{'ngrams'});
        delete $current_segment->{'ngrams'}; # to empty memory space

        # turn each class into its normalized value and attach the model to the segment
        string_ClassFrequency_model(\%class_model);
        $current_segment->{'style_model'} = \%class_model;
    }
}