use strict;
use warnings;
use File::Find;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
my $folder      = '.';           # Directory tree to scan (find() descends into subdirectories)
my $file_ext    = 'trm';         # Extension of the files to process (matched case-insensitively)
my $output_file = 'output.txt';  # Report file, one "base,first,last,count" line per base name

# Tag names to ignore; compared case-insensitively against each base name
# captured by process_file.
my @excludePatterns = qw([info]);

# ---------------------------------------------------------------------------
# Statistics filled in by process_file, keyed by the non-numeric part of a
# bracketed tag (for a tag such as "[abc12]" the key is "abc").
# ---------------------------------------------------------------------------
my %baseCounts;        # number of times each base name was seen
my %firstOccurrence;   # numeric suffix of the first tag seen for each base name
my %lastOccurrence;    # numeric suffix of the most recent tag seen for each base name

# Walk the folder; process_file is invoked for every entry found.
find(\&process_file, $folder);

# Open the output file for writing
open my $output_handle, '>', $output_file or die "Cannot open output file $output_file: $!";

# Report every base name. Keys are sorted so that the console output and the
# report file come out in the same, reproducible order on every run (plain
# hash order is randomized).
BASE:
foreach my $baseLine (sort keys %baseCounts) {
    # Base names are stored without their enclosing brackets (they are re-added
    # for the console line below), so the INFO tag has to be matched as "info".
    next BASE if lc($baseLine) eq 'info';

    my $count      = $baseCounts{$baseLine};
    my $firstDigit = $firstOccurrence{$baseLine} // 'N/A';
    my $lastDigit  = $lastOccurrence{$baseLine}  // 'N/A';

    # Print to both the console and the output file
    print "[$baseLine] starts at $firstDigit and ends at $lastDigit, appears $count times.\n";
    print {$output_handle} "$baseLine,$firstDigit,$lastDigit,$count\n"
        or die "Cannot write to output file $output_file: $!";
}

# Buffered write errors (e.g. a full disk) only surface at close time, so the
# result of close is checked as well.
close $output_handle or die "Cannot close output file $output_file: $!";

# process_file - File::Find "wanted" callback.
#
# Called by find() once per directory entry, with $_ set to the entry's name
# and $File::Find::name set to its path from the start folder. Entries that
# are not plain files ending in ".$file_ext" (case-insensitive) are ignored.
#
# Every matching file is read line by line and scanned for bracketed tags of
# the form "[<base><digits>]". For each tag whose base name is not excluded:
#   %baseCounts      - incremented for the base name
#   %firstOccurrence - set to the digits of the first tag seen for the base name
#   %lastOccurrence  - set to the digits of the latest tag seen for the base name
#
# Takes no arguments and returns nothing useful; dies if a matching file
# cannot be opened.
sub process_file {
    # find() chdir()s into each directory before calling us, so the bare name
    # in $_ is the path that is valid for file tests and open(). The full
    # $File::Find::name is relative to the start folder and is therefore only
    # used for error messages.
    my $file_name = $_;
    my $file_path = $File::Find::name;

    return unless -f $file_name && $file_name =~ /\.(?:$file_ext)$/i;

    # Base names are captured without their brackets, so exclusion patterns
    # written as "[info]" are reduced to "info" before being compared. The
    # hard-coded INFO tag is always excluded.
    my %is_excluded;
    for my $pattern (@excludePatterns, '[INFO]') {
        (my $bare = lc $pattern) =~ s/\A\[|\]\z//g;
        $is_excluded{$bare} = 1;
    }

    open my $file_handle, '<', $file_name or die "Cannot open file $file_path: $!";
    while (my $line = <$file_handle>) {
        # The base part must not contain digits or brackets; without the
        # bracket restriction a line such as "[INFO] text [abc12]" would yield
        # the bogus base name "INFO] text [abc".
        TAG:
        while ($line =~ /\[(?<base>[^\d\[\]]+)(?<digits>\d+)\]/g) {
            my $baseLine  = $+{base};
            my $endDigits = $+{digits};

            # Skip excluded tags before touching any of the statistics
            next TAG if $is_excluded{ lc $baseLine };

            # Remember the first numeric suffix once, and always the latest one
            $firstOccurrence{$baseLine} //= $endDigits;
            $lastOccurrence{$baseLine}    = $endDigits;

            $baseCounts{$baseLine}++;
        }
    }
    close $file_handle;

    return;
}
