#!/usr/bin/env perl

use strict;
use warnings;

use Data::Dumper;
use File::Path qw(make_path);
use File::Spec;
use Getopt::Long;
use FindBin;
use lib "$FindBin::Bin/../lib";
use Pod::Usage;

use App::Test::Generator::SchemaExtractor;

=head1 NAME

extract-schemas - Extract test schemas from Perl modules

=head1 SYNOPSIS

    extract-schemas [options] <module.pm>

    Options:
      --output-dir DIR    Output directory for schema files (default: schemas/)
      --strict-pod=off|warn|fatal
      --verbose           Show detailed analysis
      --fuzz              Run coverage-guided fuzzing on extracted schemas
      --fuzz-iters N      Iterations per method when fuzzing (default: 100)
                          (no short form, to avoid conflict with --fuzz/-f)
      --fuzz-all          Fuzz all methods, including those with no input schema
      --corpus-dir DIR    Directory to persist fuzz corpora (default: schemas/corpus/)
      --help              Show this help message
      --man               Show full documentation

    Examples:
      extract-schemas lib/MyModule.pm
      extract-schemas --output-dir my_schemas --verbose lib/MyModule.pm
      extract-schemas --fuzz lib/MyModule.pm
      extract-schemas --fuzz --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm
      extract-schemas --fuzz --fuzz-all lib/MyModule.pm

=head1 QUICK START

Run C<extract-schemas --strict-pod=warn -v --fuzz lib/MyModule.pm> to analyse your module and
automatically probe each method with hundreds of fuzzed inputs,
looking for
crashes caused by inputs that should be valid.
Anything suspicious is saved to C<schemas/corpus/>.

If genuine bugs are found,
run C<fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t>
to turn them into regression tests that will fail until you fix the underlying code and pass forever after.
Run C<extract-schemas --fuzz> regularly - each
run builds on the last, probing deeper into your code each time.

Otherwise, for each of the functions in MyModule.pm,
C<fuzz-harness-generator -r schemas/function.yml>

=head1 DESCRIPTION

This tool analyzes a Perl module and generates YAML schema files for each
method, suitable for use with L<App::Test::Generator>
using the C<fuzz-harness-generator> program which will create the C<.t> file to run through C<prove>.

The extractor uses three sources of information:

=over 4

=item 1. POD Documentation

Parses parameter descriptions from POD to extract types and constraints.

=item 2. Code Analysis

Analyzes validation patterns in the code (ref checks, length checks, etc.)

=item 3. Method Signatures

Extracts parameter names from method signatures.

=back

The tool assigns a confidence level (high/medium/low) to each schema based
on how much information it could infer.

=head1 FUZZING

When C<--fuzz> is specified, the tool will additionally run
C<App::Test::Generator::CoverageGuidedFuzzer> against each method after
schema extraction.

By default all methods with at least one known input parameter are fuzzed,
regardless of confidence level. Use C<--fuzz-all> to also attempt fuzzing
methods with no input schema (these will use purely random generation).

The fuzzer will:

=over 4

=item * Load and C<require> the target module at runtime

=item * Run coverage-guided fuzzing using the extracted schema as input spec

=item * Report any crashes or unexpected errors found

=item * Persist a corpus to C<--corpus-dir> for incremental improvement across runs

=back

Corpus files are named C<< <corpus-dir>/<method>.json >> and are automatically
loaded on subsequent runs, so each run builds on the last.

=cut

# ---------------------------------------------------------------------------
# Option parsing
# ---------------------------------------------------------------------------

my %cli_opts = (
    help => 0,
    man  => 0,
);

my %extractor_opts = (
    output_dir => 'schemas',
    strict_pod => 'warn',
    verbose    => 0,
);

my $fuzz       = 0;
my $fuzz_all   = 0;
my $fuzz_iters = 100;
my $corpus_dir;   # default set after output_dir is known

GetOptions(
    'output-dir|o=s'  => \$extractor_opts{output_dir},
    'strict-pod|s=s'  => \$extractor_opts{strict_pod},
    'verbose|v'       => \$extractor_opts{verbose},
    'fuzz|f'          => \$fuzz,
    'fuzz-all'        => \$fuzz_all,
    'fuzz-iters=i'    => \$fuzz_iters,
    'corpus-dir|c=s'  => \$corpus_dir,
    'help|h'          => \$cli_opts{help},
    'man|m'           => \$cli_opts{man},
) or pod2usage(2);

pod2usage(-exitval => 0, -verbose => 1) if $cli_opts{help};
pod2usage(-exitval => 0, -verbose => 2) if $cli_opts{man};

if ($extractor_opts{strict_pod} !~ /^(off|warn|fatal)$/) {
	die "Invalid --strict-pod value '$extractor_opts{strict_pod}'. Expected off, warn, or fatal";
}

my $input_file = shift @ARGV or pod2usage('Error: No input file specified');
die "Error: File not found: $input_file" unless -f $input_file;

# Default corpus dir sits under the output dir
$corpus_dir //= File::Spec->catdir($extractor_opts{output_dir}, 'corpus');

# ---------------------------------------------------------------------------
# Schema extraction
# ---------------------------------------------------------------------------

print "Extracting schemas from: $input_file\n";
print "Output directory: $extractor_opts{output_dir}\n\n";

make_path($extractor_opts{output_dir}) unless -d $extractor_opts{output_dir};

my $extractor = App::Test::Generator::SchemaExtractor->new(
    input_file => $input_file,
    %extractor_opts,
);

my $schemas = $extractor->extract_all();

# ---------------------------------------------------------------------------
# Optional: coverage-guided fuzzing
# ---------------------------------------------------------------------------

my %fuzz_results;   # method_name => report hashref

if ($fuzz) {
    require App::Test::Generator::CoverageGuidedFuzzer;
    make_path($corpus_dir) unless -d $corpus_dir;

    # Load the target module once so all methods are callable
    my $package = _load_target_module($input_file, $schemas);

    # Try to build a default instance for object method calls.
    # Most OO modules need a $self as the first argument.
    # We try new() with no args, then new({}), then give up and fuzz as functions.
    my $instance = _try_construct($package);
    if ($instance) {
        print "Constructed $package instance for method calls.\n";
    } else {
        print "Could not construct $package instance; fuzzing as functions.\n";
    }

    print "Fuzzing with $fuzz_iters iterations per method",
          ($fuzz_all ? ' (all methods)' : ' (methods with known inputs)'),
          "...\n\n";

    foreach my $method (sort keys %$schemas) {
        my $schema = $schemas->{$method};
        my $iconf  = $schema->{_confidence}{input}{level} // 'low';

        unless ($fuzz_all) {
            # Skip methods with no input schema at all — there is nothing to fuzz
            next if $iconf eq 'none' && !%{ $schema->{input} // {} };
        }

        my $sub_ref = $package->can($method);
        unless ($sub_ref) {
            warn "  Skipping $method: not callable in $package\n";
            next;
        }

        # Skip constructors and AUTOLOAD — not suitable for direct fuzzing
        if ($method =~ /^(new|AUTOLOAD|DESTROY|import)$/) {
            print "  Skipping $method (constructor/special method)\n"
                if $extractor_opts{verbose};
            next;
        }

        my $corpus_file = File::Spec->catfile($corpus_dir, "$method.json");

        print "  Fuzzing $method ($iconf confidence)... ";

        my $fuzzer = App::Test::Generator::CoverageGuidedFuzzer->new(
            schema      => $schema,
            target_sub  => $sub_ref,
            instance    => $instance,
            iterations  => $fuzz_iters,
        );

        $fuzzer->load_corpus($corpus_file) if -f $corpus_file;

        my $report = $fuzzer->run();
        $fuzzer->save_corpus($corpus_file);

        $fuzz_results{$method} = $report;

        printf "%d bugs, %d branches covered\n",
            $report->{bugs_found},
            $report->{branches_covered};
    }

    print "\n";
}

# ---------------------------------------------------------------------------
# Summary report
# ---------------------------------------------------------------------------

print '=' x 70, "\n",
      "EXTRACTION SUMMARY\n",
      '=' x 70, "\n\n";

my %input_confidence_counts  = (high => 0, medium => 0, low => 0, none => 0);
my %output_confidence_counts = (high => 0, medium => 0, low => 0, none => 0);

foreach my $method (sort keys %$schemas) {
    my $schema = $schemas->{$method};
    my $iconf  = $schema->{_confidence}{input}{level}  // 'low';
    my $oconf  = $schema->{_confidence}{output}{level} // 'low';
    $input_confidence_counts{$iconf}++;
    $output_confidence_counts{$oconf}++;

    my $param_count = scalar grep { $_ !~ /^_/ } keys %{ $schema->{input} };

    my $fuzz_col = '';
    if (exists $fuzz_results{$method}) {
        my $r = $fuzz_results{$method};
        $fuzz_col = $r->{bugs_found}
            ? sprintf('  BUGS: %d', $r->{bugs_found})
            : '  fuzz: ok';
    }

    printf "%-30s %d params  [%s input confidence] [%s output confidence]%s\n",
        $method, $param_count, uc($iconf), uc($oconf), $fuzz_col;
}

print "\n";
print 'Total methods: ', (scalar keys %$schemas), "\n";
print "  Input:\n";
print "    High confidence:   $input_confidence_counts{high}\n";
print "    Medium confidence: $input_confidence_counts{medium}\n";
print "    Low confidence:    $input_confidence_counts{low}\n";
print "  Output:\n";
print "    High confidence:   $output_confidence_counts{high}\n";
print "    Medium confidence: $output_confidence_counts{medium}\n";
print "    Low confidence:    $output_confidence_counts{low}\n";
print "\n";

if ($input_confidence_counts{low} > 0 || $input_confidence_counts{medium} > 0) {
    print "RECOMMENDATION:\n",
          "Review the generated schemas in $extractor_opts{output_dir}/\n",
          "Focus on methods with medium/low confidence ratings.\n\n";
}

# Fuzz bug detail
if (%fuzz_results) {
    my $total_bugs = 0;
    $total_bugs += $_->{bugs_found} for values %fuzz_results;

    if ($total_bugs) {
        print '=' x 70, "\n",
              "FUZZING BUGS FOUND ($total_bugs total)\n",
              '=' x 70, "\n\n";

        foreach my $method (sort keys %fuzz_results) {
            my $r = $fuzz_results{$method};
            next unless $r->{bugs_found};
            print "  $method:\n";
            for my $i (0 .. $#{ $r->{bugs} }) {
                my $bug = $r->{bugs}[$i];
                my $inp = defined($bug->{input}) ? qq("$bug->{input}") : 'undef';
                printf "    Bug %d: input=%-30s error=%s\n",
                    $i + 1, $inp, $bug->{error};
            }
            print "\n";
        }
        print "Corpora saved to: $corpus_dir/\n\n";
    } else {
        print "Fuzzing complete: no bugs found across ",
              scalar(keys %fuzz_results), " methods.\n\n";
    }
}

if ($extractor_opts{verbose}) {
    print "Schemas:\n\t", Dumper($schemas);
}

print "Schema files written to: $extractor_opts{output_dir}/\n";

# ---------------------------------------------------------------------------
# Helper: load the target module so methods become callable
# ---------------------------------------------------------------------------

sub _load_target_module {
    my ($input_file, $schemas) = @_;

    # Derive the package name from the first schema entry that has 'module' set
    my ($package) = map  { $schemas->{$_}{module} }
                    grep { $schemas->{$_}{module} }
                    keys %$schemas;

    die 'Could not determine package name from extracted schemas' unless $package;

    # Add the module's containing lib dir to @INC
    # Walks up from the file looking for a 'lib' directory
    my $abs = File::Spec->rel2abs($input_file);
    my @dirs = File::Spec->splitdir( (File::Spec->splitpath($abs))[1] );

    while (@dirs) {
        my $candidate = File::Spec->catdir(@dirs, 'lib');
        if (-d $candidate) {
            lib->import($candidate);
            last;
        }
        pop @dirs;
    }

    eval "require $package"
        or die "Could not load $package for fuzzing: $@";

    return $package;
}

# Try to construct a default instance of the target package for method calls.
# Attempts new() with progressively more forgiving argument lists.
# Returns the instance on success, undef if nothing works.
sub _try_construct {
    my ($package) = @_;

    for my $args ([], [{}], [undef]) {
        my $obj = eval { $package->new(@$args) };
        next if $@;
        next unless defined $obj && ref $obj;
        return $obj;
    }

    return undef;
}

__END__

=head1 SCHEMA FORMAT

The generated YAML files have the following structure:

    method: method_name
    confidence: high|medium|low
    notes:
      - Any warnings or suggestions
    input:
      param_name:
        type: string|integer|number|boolean|arrayref|hashref|object
        min: 5
        max: 100
        optional: 0
        matches: /pattern/

=head1 CONFIDENCE LEVELS

=over 4

=item B<high>

Strong evidence from POD and code analysis. Schema should be accurate.

=item B<medium>

Partial information available. Review recommended.

=item B<low>

Limited information. Manual review required.

=back

=head1 EXAMPLES

=head2 Basic Usage

    extract-schemas lib/MyModule.pm

=head2 Fuzz methods with known inputs

    extract-schemas --fuzz lib/MyModule.pm

=head2 Fuzz everything, 300 iterations, custom corpus dir

    extract-schemas --fuzz --fuzz-all --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm

=head2 Incremental fuzzing (corpus grows across runs)

    # First run: builds initial corpus
    extract-schemas --fuzz lib/MyModule.pm

    # Subsequent runs: loads corpus and extends it
    extract-schemas --fuzz lib/MyModule.pm

=head2 Verbose Mode

    extract-schemas --verbose lib/MyModule.pm

=head2 Pod Checking

  --strict-pod=LEVEL
    off    - do not validate POD
    warn   - warn on mismatches (default)
    fatal  - abort on mismatches

=head1 NEXT STEPS

After extracting schemas:

1. Review the generated YAML files, especially those marked low confidence
2. Edit the schemas to add missing information or correct errors
3. Use the schemas with App::Test::Generator:

    test-generator --schema schemas/my_method.yaml

=head1 SEE ALSO

L<App::Test::Generator>, L<App::Test::Generator::CoverageGuidedFuzzer>,
L<PPI>, L<Pod::Simple>

=head1 AUTHOR

Nigel Horne

=cut
