#!/usr/bin/perl  -I/home/phil/perl/cpan/DataEditXml/lib/ -I/home/phil/perl/cpan/DataTableText/lib/
#-------------------------------------------------------------------------------
# Cross reference Dita XML, match topics and ameliorate missing references.
# Philip R Brenan at gmail dot com, Appa Apps Ltd Inc, 2016-2018
#-------------------------------------------------------------------------------
# It is easier to criticize than to fix.
# Details of reports in documentation
# podDocumentation

package Data::Edit::Xml::Xref;
our $VERSION = 20190131;
use v5.20;
use warnings FATAL => qw(all);
use strict;
use Carp qw(confess cluck);
use Data::Dump qw(dump);
use Data::Edit::Xml;
use Data::Table::Text qw(:all);
use utf8;

#D1 Cross reference                                                             # Check the cross references in a set of Dita files and report the results.

sub improvementLength {80}                                                      #P Improvement length

sub lll(@)                                                                      #P Write a message
 {my (@m) = @_;                                                                 # Message text
  say STDERR timeStamp, join "", " Xref: ", @_;
 }

sub xref(%)                                                                     # Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>
 {my (%attributes) = @_;                                                        # Attributes
  my $xref = genHash(__PACKAGE__,                                               # Attributes used by the Xref cross referencer.
    attributeCount=>{},                                                         # {file}{attribute} == count of the different xml attributes found in the xml files.
    author=>{},                                                                 # {file} = author of this file.
    badBookMaps=>{},                                                            # Bad book maps.
    badConRefs=>{},                                                             # {sourceFile} = [file, href] indicating the file has at least one bad conref.
    badConRefsList=>{},                                                         # Bad conrefs - by file.
    badGuidHrefs=>{},                                                           # Bad conrefs - all.
    badImageRefs=>{},                                                           # Consolidated images missing.
    badTables=>[],                                                              # Array of tables that need fixing.
    badTopicRefs=>{},                                                           # [file, href]   Invalid href attributes found on topicref tags.
    badXml1=>{},                                                                # [Files] with a bad xml encoding header on the first line.
    badXml2=>{},                                                                # [Files] with a bad xml doc type on the second line.
    badXRefs=>{},                                                               # Bad Xrefs - by file
    badXRefsList=>{},                                                           # Bad Xrefs - all
    conRefs=>{},                                                                # {file}{href}   Count of conref definitions in each file.
    docType=>{},                                                                # {file} == docType:  the docType for each xml file.
    duplicateIds=>{},                                                           # [file, id]     Duplicate id definitions within each file.
    duplicateTopicIds=>{},                                                      # Duplicate topic ids
    duplicateTopicIds=>{},                                                      # [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.
    fileExtensions=>[qw(.dita .ditamap .xml .fodt)],                            # Default file extensions to load
    fixBadRefs=>undef,                                                          #I Try to fix bad references in L<these files|/fixRefs> where possible by either changing a guid to a file name assuming the right file is present or failing that by moving the failing reference to the "xtrf" attribute.
    fixRefs=>{},                                                                # {file}{ref} where the href or conref target is not present.
    fixedRefs=>[],                                                              # [] hrefs and conrefs from L<fixRefs|/fixRefs which have been ameliorated where possible by either changing a guid to a file name assuming the right file is present or failing that by moving the failing reference to the "xtrf" attribute.
    goodBookMaps=>{},                                                           # Good book maps.
    goodConRefs=>{},                                                            # Good con refs - by file.
    goodConRefsList=>{},                                                        # Good con refs - all.
    goodGuidHrefs=>{},                                                          # {file}{href}{location}++ where a href that starts with GUID- has been correctly resolved.
    goodImageRefs=>{},                                                          # Consolidated images found.
    goodTopicRefs=>{},                                                          # Good topic refs.
    goodXRefs=>{},                                                              # Good xrefs - by file.
    goodXRefsList=>{},                                                          # Good xrefs - all.
    guidHrefs=>{},                                                              # {file}{href} = location where href starts with GUID- and is thus probably a guid.
    guidToFile=>{},                                                             # {topic id which is a guid} = file defining topic id.
    ids=>{},                                                                    # {file}{id}     Id definitions across all files.
    images=>{},                                                                 # {file}{href}   Count of image references in each file.
    improvements=>{},                                                           # Suggested improvements - a list of improvements that might be made.
    inputFiles=>[],                                                             # Input files from L<inputFolder|/inputFolder>.
    inputFolder=>undef,                                                         #I A folder containing the dita and ditamap files to be cross referenced.
    inputFolderImages=>{},                                                      # {filename} = full file name which works well for images because the md5 sum in their name is probably unique.
    maximumNumberOfProcesses=>4,                                                #I Maximum number of processes to run in parallel at any one time.
    md5Sum=>{},                                                                 # MD5 sum for each input file.
    missingImageFiles=>{},                                                      # [file, href] == Missing images in each file.
    missingTopicIds=>{},                                                        # Missing topic ids.
    noHref=>{},                                                                 # Tags that should have an href but do not have one.
    notReferenced=>{},                                                          # Files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.
    parseFailed=>{},                                                            # [file] files that failed to parse.
    reports=>q(reports),                                                        #I Reports folder: the cross referencer will write reports to files in this folder.
    results=>[],                                                                # Summary of results table.
    sourceFile=>undef,                                                          # The source file from which this structure was generated.
    statusLine=>undef,                                                          # Status line summarizing the cross reference.
    statusTable=>undef,                                                         # Status table summarizing the cross reference.
    summary=>1,                                                                 #I Print the summary line.
    tagCount=>{},                                                               # {file}{tags} == count of the different tag names found in the xml files.
    title=>{},                                                                  # {file} = title of file.
    topicIds=>{},                                                               # {file} = topic id - the id on the outermost tag.
    topicRefs=>{},                                                              # {file}{href}++ References from bookmaps to topics via appendix, chapter, topicref.
    validationErrors=>{},                                                       # True means that Lint detected errors in the xml contained in the file.
    vocabulary=>{},                                                             # The text of each topic shorn of attributes for vocabulary comparison.
    xRefs=>{},                                                                  # {file}{href}++ Xrefs references.
    xrefBadScope=>{},                                                           # External xrefs with no scope=external.
    xrefBadFormat=>{},                                                          # External xrefs with no format=html.
    relativePath=>undef,                                                        #I Report files relative to this path or absolutely if undefined.
    matchTopics=>undef,                                                         #I Match topics by title and by vocabulary to the specified confidence level between 0 and 1.  This operation might take some time to complete on a large corpus.
   );

  loadHash($xref, @_);                                                          # Load attributes complaining about any invalid ones

  $xref->inputFolder or confess "Please supply a value for: inputFolder";
  $xref->inputFolder =~ s(\/*\Z) (\/)s;                                         # Cleanup path names
  $xref->inputFolder = absFromAbsPlusRel(currentDirectory, $xref->inputFolder)  # Make input folder absolute
    if $xref->inputFolder !~ m(\A/);

  if (my $r = $xref->relativePath)                                              # Make relativePath absolute
   {$r =~ s(/*\Z) (/)s;
    if ($r !~ m(\A/\Z)s)
     {$r = absFromAbsPlusRel(currentDirectory, $r);
     }
    $xref->relativePath = $r;
   }

  my @phases = (q(loadInputFiles),                                              # All non topic matching reports
                q(analyze),
                q(reportXml1),
                q(reportXml2),
                q(reportDuplicateIds),
                q(reportDuplicateTopicIds),
                q(reportNoHrefs),
                q(reportXrefs),
                q(reportTopicRefs),
                q(reportTables),
                q(reportConrefs),
                q(reportImages),
                q(reportParseFailed),
                q(reportAttributeCount),
                q(reportTagCount),
                q(reportDocTypeCount),
                q(reportFileExtensionCount),
                q(reportFileTypes),
                q(reportValidationErrors),
                q(reportBookMaps),
                q(reportGuidHrefs),
                q(reportGuidsToFiles),
                q(reportNotReferenced),
                q(reportExternalXrefs),
                q(reportPossibleImprovements),
                q(reportTopicDetails),
                q(reportTopicReuse),
                q(reportMd5Sum),
               );

  if ($xref->matchTopics)                                                       # Topic matching reports
   {push @phases, q(reportSimilarTopicsByTitle),
                  q(reportSimilarTopicsByVocabulary);
   }

  for my $phase(@phases)                                                        # Perform analysis phases
   {#lll "Phase: $phase";
    $xref->$phase;
   }

  if ($xref->fixBadRefs)                                                        # Fix files if requested
   {#lll "Phase: fixFiles";
    $xref->fixFiles;
    #lll "Phase: fixFiles end";
   }

  formattedTablesReport
   (title=>q(Reports available),
    head=><<END,
NNNN reports available on DDDD

Sorted by title
END
   file=>fpe($xref->reports, qw(reports txt)));

  if (1)                                                                        # Summarize
   {my @o;
    my $save = sub
     {my ($levels, $field, $plural, $single) = @_;
      my $n = &countLevels($levels, $xref->{$field});
      push @o, [$n,            $plural]                   if $n >  1;
      push @o, [$n, $single // ($plural =~ s(s\Z) ()gsr)] if $n == 1;
     };

    $save->(1, "badBookMaps",       q(bad book maps));                          # Status line components
    $save->(1, "badConRefs",        q(files with bad conrefs), q(file with bad conrefs));
    $save->(1, "badConRefsList",    q(bad conrefs));
    $save->(1, "badGuidHrefs",      q(invalid guid hrefs));
    $save->(1, "badImageRefs",      q(missing image files));
    $save->(1, "badTopicRefs",      q(bad topicrefs));
    $save->(1, "badTables",         q(bad tables));
    $save->(1, "badXml1",           q(bad first lines));
    $save->(1, "badXml2",           q(bad second lines));
    $save->(1, "badXRefs",          q(files with bad xrefs), q(file with bad xrefs));
    $save->(1, "badXRefsList",      q(bad xrefs));
    $save->(1, "duplicateIds",      q(duplicate ids));
    $save->(2, "duplicateTopicIds", q(duplicate topic ids));
    $save->(1, "fixedRefs",         q(references fixed), q(reference fixed));
#   $save->(2, "improvements",      q(improvements));
    $save->(1, "missingImageFiles", q(missing image references));
    $save->(1, "missingTopicIds",   q(missing topic ids));
    $save->(2, "noHref",            q(hrefs missing), q(href missing));
    $save->(1, "notReferenced",     q(files not referenced), q(file not referenced));
    $save->(1, "parseFailed",       q(files failed to parse), q(file failed to parse));
    $save->(2, "validationErrors",  q(validation errors)); # Needs testing
    $save->(2, "xrefBadFormat",     q(External xrefs with no format=html));
    $save->(2, "xrefBadScope",      q(External xrefs with no scope=external));

    $xref->statusLine = @o ? join " ",                                          # Status line
      "Xref:", join ", ",
               map {join " ", @$_}
               sort
                {return $$a[1] cmp $$b[1] if $$b[0] == $$a[0];
                 $$b[0] <=> $$a[0]
                }
               @o : q();

    $xref->statusTable = formatTable
     ([sort {$$b[0] <=> $$a[0]} @o], [qw(Count Condition)]);                    # Summary in status form
    $xref->results = \@o;                                                       # Save status line components

    if (@o and $xref->summary)                                                  # Summary line
     {say STDERR $xref->statusLine;
     }
   }

  $xref                                                                         # Return Xref results
 }

sub countLevels($$)                                                             #P Count has elements to the specified number of levels
 {my ($l, $h) = @_;                                                             # Levels, hash
  if ($l <= 1)
   {return scalar keys @$h if ref($h) =~ m(array)i;
    return scalar keys %$h if ref($h) =~ m(hash)i;
   }
  my $n = 0;
  if   (ref($h) =~ m(hash)i)
   {$n += &countLevels($l-1, $_) for values %$h;
   }
  elsif (ref($h) =~ m(array)i)
   {$n += &countLevels($l-1, $_) for values @$h;
   }
  $n
 }

sub relativeFilePath($$)                                                             #P Format file name for easy use on windows
 {my ($xref, $file) = @_;                                                       # Xref, file

  if (my $w = $xref->relativePath)
   {return relFromAbsAgainstAbs($file, $w);
   }
  $file
 }

sub unixFile($$)                                                                #P Format file name for easy use on unix
 {my ($xref, $file) = @_;                                                       # Xref, file
  return $file; # Because we use these file names as keys and we have not got them all converted
  if (my $u = $xref->relativePath // $xref->inputFolder)
   {if (my $i = $xref->inputFolder)
     {return swapFilePrefix($file, $i, $u);
     }
   }
  $file
 }

sub formatFileNames($$$)                                                        #P Format file names for easy use on unix and windows
 {my ($xref, $array, $column) = @_;                                             # Xref, Array of arrays containing file names in unix format, column containing file names

  for my $array(@$array)
   {if (my $f = $array->[$column])
     {$array->[$column] = relativeFilePath($xref, $f);
      splice @$array, $column+1, 1, unixFile($xref, $f);
     }
   }
 }

sub loadInputFiles($)                                                           #P Load the names of the files to be processed
 {my ($xref) = @_;                                                              # Cross referencer
  $xref->inputFiles = [searchDirectoryTreesForMatchingFiles
    $xref->inputFolder, @{$xref->fileExtensions}];

  my @images = searchDirectoryTreesForMatchingFiles($xref->inputFolder);        # Input files
  $xref->inputFolderImages = {map {fn($_), $_} @images};                        # Image file name which works well for images because the md5 sun in their name is probably unique
 }

sub analyzeOneFile($)                                                           #P Analyze one input file
 {my ($iFile) = @_;                                                             # File to analyze
  my $xref = bless {};                                                          # Cross referencer for this file
     $xref->sourceFile = $iFile;                                                # File analyzed
  my @improvements;                                                             # Possible improvements

  $xref->md5Sum->{$iFile} = fileMd5Sum($iFile);                                 # Md5 sum for input file

# my $x = eval {Data::Edit::Xml::new($iFile, lineNumbers=>1)};                  # Parse xml
  my $x = eval {Data::Edit::Xml::new($iFile)};                                  # Parse xml - at this point if the caller is interested in line numbers they should have added them.

  if ($@)
   {$xref->parseFailed->{$iFile}++;
    return $xref;
   }

  $x->by(sub                                                                    # Each node
   {my ($o) = @_;

    my $content = sub                                                           #P First few characters of content on one line to avoid triggering multi table layouts
     {my ($o) = @_;                                                             # String
      nws($o->stringContent, improvementLength)
     };
    my $loc = sub                                                               #P Location
     {my ($o) = @_;                                                             # String
      ($o->lineLocation, $iFile)
     };
    my $tag = -t $o;

    if (my $i = $o->id)                                                         # Id definitions
     {$xref->ids->{$iFile}{$i}++;
     }
    if ($tag eq q(xref))                                                        # Xrefs but not to the web
     {if (my $h = $o->href)
       {if ($h =~ m(\A(https?://|mailto:|www.))i)                               # Check attributes on external links
         {if ($o->attrX_scope !~ m(\Aexternal\Z)s)
           {$xref->xrefBadScope->{$iFile}{$h} = -A $o;
           }
          if ($o->attrX_format !~ m(\Ahtml\Z)s)
           {$xref->xrefBadFormat->{$iFile}{$h} = -A $o;
           }
         }
        elsif ($h =~ m(\Aguid-)is)                                              # Href is a guid
         {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];
         }
        else #if ($o->attrX_format =~ m(\Adita)i)                               # Check xref has format=dita AW83 at 2018.12.13 01:10:33
         {$xref->xRefs->{$iFile}{$h}{$o->stringText}++;
         }
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }
    elsif ($tag =~ m(\A(appendix|chapter|link|mapref|notices|topicref)\Z)is)    # References from bookmaps
     {if (my $h = $o->href)
       {if ($h =~ m(\Aguid-)is)                                                 # Href is a guid
         {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];
         }
        else
         {$xref->topicRefs->{$iFile}{$h}{$o->attr_navtitle//$o->stringText}++;
         }
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }
    elsif ($tag eq q(image))                                                    # Images
     {if (my $h = $o->href)
       {if ($h =~ m(\Aguid-)is)                                                 # Href is a guid
         {$xref->guidHrefs->{$iFile}{$h} = [$tag, $o->lineLocation];            # Resolve image later
         }
        else
         {$xref->images->{$iFile}{$h}++;
         }
       }
      else
       {push @{$xref->noHref->{$iFile}}, [$tag, $o->lineLocation, $iFile];      # No href
       }
     }
    if (my $conref = $o->attr_conref)                                           # Conref
     {$xref->conRefs->{$iFile}{$conref}++;
     }
    if ($o->isText_p)                                                           # Notes
     {my $t = nws($o->text, improvementLength);
      if ($t =~ m(\b(Attention|Caution|Danger|Fastpath|Important|Notice|Note|Remember|Restriction|Tip|Trouble|Warning)\b)is)
       {push @improvements, ["Note", $t, &$loc];
       }
     }
    elsif ($tag eq q(required-cleanup))                                         # Required cleanup
     {my $t = &$content;
      push @improvements, [-t $o, $t, &$loc];
     }
    elsif ($tag eq q(steps-unordered))                                          # Steps unordered
     {my $t = nws(-c $o, improvementLength);
      push @improvements, [-t $o, $t, &$loc];
     }
    elsif ($tag eq q(p))                                                        # Paragraphs with lots of bold
     {my $n = my @c = $o->c_b;
      if ($n >= 3)
       {my $t = &$content;
        push @improvements,
         [q(More than 3 bold in p), $t, &$loc];
       }
     }
    elsif ($tag eq q(title))                                                    # Title
     {my $t = &$content;
      $xref->title->{$iFile} = $t;                                              # Topic Id
      if (my $p = $o->parent)
       {if (my ($w) = split /\s+/, $t, 2)
         {my $task = $w =~ m(\AHow|ing\Z)is;                                    # How/ing concept/task

          if ($p->at_concept && $task)
           {push @improvements, [q(Better as task?),    $t, &$loc];
           }
          elsif ($p->at_task && !$task)
           {push @improvements, [q(Better as concept?), $t, &$loc];
           }
         }
       }
     }
    elsif ($tag eq q(author))                                                   # Author
     {$xref->author->{$iFile} = my $t = &$content;
     }
    elsif ($tag eq q(tgroup))                                                   # Tgroup cols
     {my $error = sub                                                           # Table error message
       {push @{$xref->badTables},
         [join('', @_), $tag, $o->lineLocation, $iFile];
       };

      my $stats     = $o->ditaTGroupStatistics;                                 # Statistics for table
      my $cols      = $stats->colsAttribute;
      my $maxCols   = max($stats->maxHead//0, $stats->maxBody//0);
      my $maxColsMP = max($stats->maxHeadMinusPadding//0,
                          $stats->maxBodyMinusPadding//0);
      if (($stats->maxHead//0) == $maxCols &&                                   # The right combination of body and header
          ($stats->minHead//0) == $maxCols &&
          ($stats->maxBody//0) == $maxCols &&
          ($stats->minBody//0) == $maxCols &&
           $stats->colSpec     == $maxCols
       or !defined($stats->maxHead)        &&                                   # No headers but everything else looks good
          ($stats->maxBody//0) == $maxCols &&
          ($stats->minBody//0) == $maxCols &&
           $stats->colSpec     == $maxCols)
       {if (!$cols)                                                             # Check for cols attribute
         {$error->(qq(No cols attribute, should be $maxCols));
         }
        elsif ($cols ne $maxCols)                                               # Cols present but wrong
         {$error->(qq(Cols attribute is $cols but should be $maxCols));
         }
       }
      elsif (($stats->maxHead//0) < $maxColsMP)                                 # Not enough headers
       {$error->(qq(Not enough headers));
       }
      else
       {$error->(qq(Column padding required));
       }
     }
   });

  push @{$xref->improvements->{$iFile}}, @improvements if @improvements;        # Save improvements

  $xref->topicIds->{$iFile} = $x->id;                                           # Topic Id
  $xref->docType ->{$iFile} = $x->tag;                                          # Document type
  $xref->attributeCount->{$iFile} = $x->countAttrNames;                         # Attribute names
  $xref->tagCount      ->{$iFile} = $x->countTagNames;                          # Tag names
  $xref->vocabulary    ->{$iFile} = $x->stringTagsAndText;                      # Text of topic minus attributes

  if (1)                                                                        # Check xml headers and lint errors
   {my @h = split /\n/, my $s = readFile($iFile);
    if (!$h[0] or $h[0] !~ m(\A<\?xml version=\"1.0\" encoding=\"UTF-8\"\?>\Z))
     {$xref->badXml1->{$iFile}++;
     }
    my $tag = $x->tag;
    if (!$h[1] or $h[1] !~ m(\A<!DOCTYPE $tag PUBLIC "-//))
     {$xref->badXml2->{$iFile}++;
     }

    $xref->validationErrors->{$iFile}++ if $s =~ m(<!--compressedErrors:)s;     # File has validation errors
   }

  $xref
 } # analyzeOneFile

sub reportGuidsToFiles($)                                                         #P Map and report guids to files
 {my ($xref) = @_;                                                              # Xref results
  my @r;
  for   my $file(sort keys %{$xref->topicIds})                                  # Each input file which will be absolute
   {if (my $topicId = $xref->topicIds->{$file})                                 # Topic Id for file - we report missing topicIs in: reportDuplicateTopicIds
     {next unless $topicId =~ m(\AGUID-)is;
      $xref->guidToFile->{$topicId} = $file;                                    # Guid Topic Id to file
      push @r, [$topicId, $file];
     }
   }
  formatFileNames($xref, \@r,  1);                                              # Format file names for easy use on unix and windows

  formatTable(\@r, <<END,
Guid            The guid being defined
Relative_Path   The source file that defines the guid as a relative file name
Absolute_Path   The source file that defines the guid as an absolute file path
END
    title    =>qq(Guid topic definitions),
    head     =>qq(Xref found NNNN guid topic definitions on DDDD),
    summarize=>1,
    file     =>fpe($xref->reports, q(lists), qw(guidsToFiles txt)));
 }

sub fixOneFile($$)                                                              #P Fix one file by moving unresolved references to the xtrf attribute
 {my ($xref, $file) = @_;                                                       # Xref results, file to fix
  my @r;                                                                        # Count of tags changed

  my $x = Data::Edit::Xml::new($file);                                          # Parse xml - should parse OK else otherwise how did we find out that this file needed to be fixed

  $x->by(sub                                                                    # Each node
   {my ($o) = @_;
    my $t  = $o->tag;                                                           # Tag
    if ($t =~  m(\A(appendix|chapter|image|link|topicref|xref)\Z)is)            # Hrefs that need to be fixed
     {if (my $h = $o->href)                                                     # Fix the href by moving it to xtrf
       {if ($xref->fixRefs->{$file}{$h})
         {if ($h =~ m(\AGUID-)is)                                               # Fixable guid reference
           {if (my $f = $xref->guidToFile->{$h})                                # File associated with guid
             {$o->href = my $H = relFromAbsAgainstAbs($f, $file);               # New href
              $o->xtrf = $h;                                                    # Save original xref so we can find it with grep
              push @r, [q(File from Guid), -A $o, $H, $file];                   # Report fix
             }
            else
             {$o->renameAttr_href_xtrf;                                         # No target file for guid
              push @r, [q(No file for Guid), -A $o, q(), $file];                # Report fix
             }
           }
          else                                                                  # Move href to xtrf as no other fix seems possible
           {$o->renameAttr_href_xtrf;
            push @r, [q(Href to xtrf), -A $o, q(), $file];
           }
         }
        else                                                                    # Fix not requested so href left alone
         {#push @r, [q(Fix not requested), -A $o, q(), $file];                  # No fix for the href has been requested so no need to report it as fixed
         }
       }
      else                                                                      # No href
       {push @r, [q(No Href), -A $o, q(), $file];
       }
     }
    if (my $conref = $o->attr_conref)                                           # Fix the conref by moving it to xtrf
     {if ($xref->fixRefs->{$file}{$conref})
       {$o->renameAttr_conref_xtrf;
        push @r, [q(Moved conref to xtrf),     -A $o, $conref, $file];          # Report fix
       }
      else
       {#push @r, [q(Conref fix not requested), -A $o, $conref, $file];         # Report not fixed
       }
     }
   });

  if (1)                                                                        # Replace xml in source file
   {my ($l, $L) = split m/\n/, readFile($file);
    my $t = -p $x;
    if ($l =~ m(\A<\?xml))                                                      # Check headers - should be improved
     {owf($file, qq($l\n$L\n$t));
     }
    else
     {owf($file, $t);
     }
   }

  \@r                                                                           # Return report of items fixed
 }

sub fixFiles($)                                                                 #P Fix files by moving unresolved references to the xtrf attribute
 {my ($xref) = @_;                                                              # Xref results
  my @r;                                                                        # Fixes made
  if (my @files = sort keys %{$xref->fixRefs})                                  # Fix files if requested
   {my @square = squareArray(@files);                                           # Divide the task

    my $ps = newProcessStarter($xref->maximumNumberOfProcesses);                # Process starter
       $ps->processingTitle   = q(Xref);
       $ps->totalToBeStarted  = scalar @square;
       $ps->processingLogFile = fpe($xref->reports, qw(log xref fix txt));

    for my $row(@square)                                                        # Each row of input files file
     {$ps->start(sub
       {my @r;                                                                  # Results
        for my $col(@$row)                                                      # Each column in the row
         {push @r, $xref->fixOneFile($col);                                     # Analyze one input file
         }
        [@r]                                                                    # Return results as a reference
       });
     }

    for my $r(deSquareArray($ps->finish))                                       # Consolidate results
     {push @r, @$r;
     }
   }

  formatFileNames($xref, \@r,  3);                                              # Format file names for easy use on unix and windows

  formatTable($xref->fixedRefs = \@r, <<END,                                    # Report results
Fix            The type of fix applied
Item           The item being fixed
Target         The new target the reference has been directed to
Relative_Path  The source file being fixed as a relative file name.
Absolute_Path  The source file being fixed as an absolute file path.
END
    summarize=>1,
    title=>qq(Fixes Applied To Failing References),
    head=><<END,
Xref applied the following fixes to failing hrefs and conrefs as requested by
the fixBadRefs parameter on DDDD
END
    file=>(my $f = fpe($xref->reports, qw(lists referencesFixed txt))));
 }

sub analyze($)                                                                  #P Analyze the input files
 {my ($xref) = @_;                                                              # Cross referencer
  my @in = @{$xref->inputFiles};                                                # Input files
  my @square = squareArray(@in);                                                # Divide the task

  my $ps = newProcessStarter($xref->maximumNumberOfProcesses);                  # Process starter
     $ps->processingTitle   = q(Xref);
     $ps->totalToBeStarted  = scalar @square;
     $ps->processingLogFile = fpe($xref->reports, qw(log xref analyze txt));

  for my $row(@square)                                                          # Each row of input files file
   {$ps->start(sub
     {my @r;                                                                    # Results
      for my $col(@$row)                                                        # Each column in the row
       {push @r, analyzeOneFile($col);                                          # Analyze one input file
       }
      [@r]                                                                      # Return results as a reference
     });
   }

  for my $x(deSquareArray($ps->finish))                                         # mmmm Merge results from each file analyzed
   {for my $field(                                                              # Merge hashes by file names which are unique - ffff
      qw(
attributeCount
author
badXml1
badXml2
conRefs
docType
fixRefs
guidHrefs
ids
images
improvements
md5Sum
noHref
parseFailed
tagCount
title
topicIds
topicRefs
validationErrors
vocabulary
xrefBadFormat
xrefBadScope
xRefs
         ))
     {next unless my $xf = $x->{$field};
      for my $f(sort keys %$xf)
       {$xref->{$field}{$f} = $xf->{$f};
       }
     }
    for my $field(                                                              # Merge arrays
      qw(badTables))
     {next unless my $xf = $x->{$field};
      push @{$xref->{$field}}, @$xf;
     }
   }
 }

sub reportDuplicateIds($)                                                       #P Report duplicate ids
 {my ($xref) = @_;                                                              # Cross referencer

  my @dups;                                                                     # Duplicate ids definitions
  for my $file(sort keys %{$xref->ids})                                         # Each input file
   {for my $id(sort keys %{$xref->ids->{$file}})                                # Each id in the file
     {my $count = $xref->ids->{$file}{$id};                                     # Number of definitions of this id in the file
      if ($count > 1)                                                           # Duplicate definition
       {push @dups, [$id, $count, $file];                                       # Save details of duplicate definition
       }
     }
   }

  $xref->duplicateIds = {map {$$_[2]=>$_} @dups};                               # All duplicates

  formatTable(\@dups, [qw(Id Count File)],
    title=>qq(Duplicate id definitions within files),
    head=><<END,
Xref found NNNN duplicate id definitions within files on DDDD

These ids are duplicated within a file, possibly because they were copied from
another part of the same file.  This report does not show ids that are the same
in different files as this is not a problem using Dita's three part addressing
scheme which requires only that the topic id be unique across all files.

Duplicate topic ids are reported in ../bad/topicIds.txt.

END
    file=>(my $f = fpe($xref->reports, qw(bad idDefinitionsDuplicated txt))));
 }

sub reportDuplicateTopicIds($)                                                  #P Report duplicate topic ids
 {my ($xref) = @_;                                                              # Cross referencer

  my %dups;                                                                     # Duplicate topic ids definitions
  my @dups;                                                                     # Duplicate topic ids definitions report
  my @miss;                                                                     # Missing topic id definitions report
  for my $file(sort keys %{$xref->topicIds})                                    # Each input file
   {if (my $i = $xref->topicIds->{$file})                                       # Topic Id
     {if (my $d = $dups{$i})                                                    # Duplicate topic id
       {push @dups, [$i, $file, $d];                                            # Save details of duplicate definition
       }
      else
       {$dups{$i} = $file;                                                      # Save topic id
       }
     }
    else
     {push @miss, [$file];                                                        # Missing topic id
     }
   }

  $xref->duplicateTopicIds = {map {$$_[0]=>$_} @dups};                          # All duplicates
  $xref->missingTopicIds   = {map {$$_[0]=>$_} @miss};                          # All missing

  formatTable(\@dups, [qw(TopicId File1 File2)],
    title=>qq(Duplicate topic id definitions),
    head=><<END,
Xref found NNNN duplicate topic id definitions on DDDD

File1, File2 are two files that both define TopicId

END
    file=>(fpe($xref->reports, qw(bad topicIdDefinitionsDuplicated txt))));

  formatTable(\@miss, [qw(File)],
    title=>qq(Topics without ids),
    head=><<END,
Xref found NNNN topics that have no topic id on DDDD

END
    file=>(fpe($xref->reports, qw(bad topicIdDefinitionsMissing txt))));
 }

sub reportNoHrefs($)                                                            #P Report locations where an href was expected but not found
 {my ($xref) = @_;                                                              # Cross referencer
  my @t;
  for my $file(sort keys %{$xref->noHref})                                      # Each input file
   {push @t,             @{$xref->noHref->{$file}};                             # Missing href details
   }

  formatFileNames($xref, \@t,  2);                                              # Format file names for easy use on unix and windows

  formatTable(\@t, <<END,
Tag            A tag that should have an xref.
Location       The location of the tag that should have an xref.
Relative_Path  The source file containing the tag as a relative file name.
Absolute_Path  The source file containing the tag as an absolute file path.
END
    title=>qq(Missing hrefs),
    head=><<END,
Xref found NNNN tags that should have href attributes but did not on DDDD
END
    file=>(fpe($xref->reports, qw(bad missingHrefAttributes txt))));
 }

sub reportRefs($$)                                                              #P Report bad references found in xrefs or conrefs as they have the same structure
 {my ($xref, $type) = @_;                                                       # Cross referencer, type of reference to be processed

  my @bad; my @good;                                                            # Bad xrefs.
  for   my $file(sort keys %{$xref->{${type}.q(Refs)}})                         # Each input file which will be absolute
   {my $sourceTopicId = $xref->topicIds->{$file};
    for my $href(sort keys %{$xref->{${type}.q(Refs)}{$file}})                  # Each href in the file which will be relative
     {my @text;
      if (               ref($xref->{${type}.q(Refs)}{$file}{$href}))           # xRef: Text associated with reference deemed helpful by Bill
       {@text =  sort keys %{$xref->{${type}.q(Refs)}{$file}{$href}};
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }
      if ($href =~ m(#))                                                        # Href with #
       {my ($hFile, $hId) = split m(#), $href;                                  # File, topicId components
        my ($topic, $id)  = split m(/), $hId;                                   # Topic, id
                    $id //= '';
           $topic  = $sourceTopicId if $topic eq q(.);                          # In topic reference
        my $target = $hFile ? absFromAbsPlusRel($file, $hFile) : $file;         # Target file absolute

        my $good = sub                                                          # Save a good reference
         {push @good, [$href, $target, $file];
         };

        my $bad = sub                                                           # Save a bad reference
         {my ($reason, $t) = @_;
          push @bad,
           [$reason, $href,
            $hFile, $topic, $id, $t, $sourceTopicId, $file, $target, @text];
         };

        if ($hFile and !(-e $target or -e wwwDecode($target)))                  # Check target file
         {&$bad(q(No such file), q());
         }
        elsif (my $t = $xref->topicIds->{$target})                              # Check topic id
         {if ($t eq $topic)
           {if (my $i = $xref->ids->{$target}{$id})
             {if ($i == 1)
               {&$good;
               }
              else
               {&$bad(q(Duplicate id in topic), $t);
               }
             }
            elsif ($id)
             {&$bad(q(No such id in topic), $t);
             }
            else
             {&$good;
             }
           }
          else
           {&$bad(q(Topic id does not match target topic), $t);
           }
         }
        elsif ($topic =~ m(\S)s)                                                # The href contains a topic id but there is not topic with that id
         {&$bad(q(No topic id on topic in target file), $t);
         }
        else
         {&$good;
         }
       }
      else                                                                      # No # in href
       {my $target = absFromAbsPlusRel($file, $href);
        if (!-e $target and !-e wwwDecode($target))                             # Actual file name or www encoded file name
         {push @bad, my $p = [qq(No such file), $href,
           $target, q(), q(), q(), $sourceTopicId, $file, $target, @text];
         }
        else
         {push @good, my $p = [$href, $target, $file];
         }
       }
     }
   }

  for my $bad(@bad)                                                             # List of files to fix
   {my $href = $$bad[1];
    my $file = $$bad[7];
    $xref->fixRefs->{$file}{$href}++;
   }

  my $Type = ucfirst $type;
  $xref->{q(bad).$Type.q(Refs)}  = {map {$$_[7]=>$_} @bad};                     # Bad references
  $xref->{q(good).$Type.q(Refs)} = {map {$$_[1]=>$_} @good};                    # Good references

  formatFileNames($xref, \@bad,   7);                                           # Format file names for easy use on unix and windows
  formatFileNames($xref, \@good,  1);                                           # Format file names for easy use on unix and windows

  $xref->{q(bad).$Type.q(RefsList)}  = \@bad;                                   # Bad references list
  $xref->{q(good).$Type.q(RefsList)} = \@good;                                  # Good references list

  my $in = $xref->inputFolder//'';
  formatTable(\@bad, <<END,
Reason          The reason why the conref failed to resolve
Href            The href in the source file
Href_File       The target file referenced by the href in the source files
Href_Topic_Id   The id of the topic referenced by the href in the source file
Target_Topic_Id The actual id of the topic in the target file
HRef_Id         The id of the statement in the body of the topic referenced by the href in the source file
Source_TopicId  The topic id at the top of the source file containing the bad reference
Relative_Path   The source file containing the reference as a relative file name
Absolute_Path   The source file containing the reference as an absolute file path
Target_File     The target file
Example_Text    Any text associated with the link such as the navtitle of a bad topicRef or the CDATA text of an xref.
END
    title    =>qq(Bad ${type}Refs),
    head     =>qq(Xref found NNNN Bad ${type}Refs on DDDD),
    summarize=>1, csv=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, q(bad), qq(${Type}Refs), q(txt))));

  formatTable(\@good, <<END,
Href            The href in the source file
Source_Window   The source file containing the reference as a relative file name
Absolute_Path     The source file containing the reference as an absolute file path
Target_File     The target file
END
    title    =>qq(Good ${type}Refs),
    head     =>qq(Xref found NNNN Good $type refs on DDDD),
    file     =>(fpe($xref->reports, q(good), qq(${Type}Refs), q(txt))));
 } # reportRefs

sub reportGuidHrefs($)                                                          #P Report on guid hrefs
 {my ($xref) = @_;                                                              # Cross referencer

  my %guidToFile;                                                               # Map guids to files
  for   my $file(sort keys %{$xref->topicIds})                                  # Each input file containing a topic id
   {my $id = $xref->topicIds->{$file};                                          # Each href in the file which will start with guid
    next unless defined $id;
    next unless $id =~ m(\Aguid-)is;
    $guidToFile{$id} = $file;                                                   # We report duplicates in reportDuplicateTopicIds
   }

  my @bad; my @good;                                                            # Good and bad guid hrefs
  for   my $file(sort keys %{$xref->guidHrefs})                                 # Each input file which will be absolute
   {my $sourceTopicId = $xref->topicIds->{$file};
    for my $href(sort keys %{$xref->guidHrefs->{$file}})                        # Each href in the file which will start with guid
     {my ($tag, $lineLocation) = @{$xref->guidHrefs->{$file}{$href}};           # Tag of node and location in source file of node doing the referencing

      $xref->fixRefs->{$file}{$href}++ unless $xref->fixRefs->{$file}{$href};   # Avoid double counting - all guid hrefs will be fixed if we are fixing hrefs as both good and bad will fail.

      if ($href =~ m(#))                                                        # Href with #
       {my ($guid, $topic, $id) = split m(#|\/), $href, 3;                      # Guid, topic, remainder
        my $targetFile   = $guidToFile{$guid};                                  # Locate file defining guid

        if (!defined $targetFile)                                               # No definition of this guid
         {push @bad,                                                            # Report missing guid
           ["No such guid defined", $tag, $href, $lineLocation, q(),
            $sourceTopicId, $targetFile, $file];
          next;
         }

        my $targetFileId = $xref->topicIds->{$targetFile} // '';                # Actual id in target file

        my $bad = sub
         {push @bad,
           [@_, $tag, $href, $lineLocation, $targetFileId, $sourceTopicId,
            $targetFile, $file];
         };

        my $good = sub
         {push @good,
           [$href, $tag, $lineLocation, $targetFile, $file];
         };

        if (!-e $targetFile)                                                    # Existence of file
         {$bad->(q(No such file));
         }
        elsif (defined $topic)                                                  # Topic defined so it must be an xref
         {if ($topic ne $guid)
           {$bad->(q(Guid does not match topic id));
           }
          elsif (defined $id)
           {if (my $i = $xref->ids->{$targetFile}{$id})                         # Check id exists in target file
             {if ($i == 1)
               {&$good;
               }
              else

               {$bad->(q(Duplicate id in topic));
               }
             }
            $bad->(q(No such id in topic));
           }
          else
           {&$good;
           }
         }
        else
         {&$good;
         }
       }
      elsif ($tag eq q(image))                                                  # Image reference
       {my $guid = $href =~ s(guid|-) ()igsr;
        if (my $image = $xref->inputFolderImages->{$guid})
         {push @good, [$tag, $href, $lineLocation, $image, $file];
          $xref->goodImageRefs->{$image}++;                                     # Found image
         }
        else
         {push @bad, [qq(No such image guid defined), $tag, $href,
           $lineLocation, q(), $sourceTopicId, q(), $file];
         }
       }
      else                                                                      # No # in href and not an image so it must be a bookmap element
       {my $targetFile = $guidToFile{$href};
        if (!defined $targetFile)                                               # No such guid
         {push @bad, [qq(No such guid defined), $tag, $href,
           $lineLocation, q(), $sourceTopicId, q(), $file];
         }
        elsif (!-e $targetFile)                                                 # Actual file name
         {push @bad, my $p = [qq(No such file), $tag, $href,
           $lineLocation, q(), $sourceTopicId, $targetFile, $file];
         }
        elsif ($xref->fixBadRefs)                                               # The file exists and we want to fix such references
         {$xref->fixRefs->{$file}{$href}++;
         }
        else
         {push @good, [$tag, $href, $lineLocation, $targetFile, $file];
          $xref->goodTopicRefs->{$targetFile}++;                                # Mark reference as found
         }
       }
     }
   }

  for my $bad(@bad)                                                             # List of files to fix
   {my $href = $$bad[2];
    my $file = $$bad[-1];
#   $xref->fixRefs->{$file}{$href}++ unless $xref->fixRefs->{$file}{$href};     # Avoid double counting
   }

  $xref->{badGuidHrefs}  = {map {$$_[7]=>$_} @bad};                             # Bad references
  $xref->{goodGuidHrefs} = {map {$$_[4]=>$_} @good};                            # Good references

  formatFileNames($xref, \@bad,   7);                                           # Format file names for easy use on unix and windows
  formatFileNames($xref, \@good,  4);                                           # Format file names for easy use on unix and windows

  my $in = $xref->inputFolder//'';
  formatTable(\@bad, <<END,
Reason          The reason why the href failed to resolve
Tag             The tag of the node doing the referencing
Href            The href of the node doing the referencing
Line_Location   The line location where the href occurred in the source file
Target_Topic_Id The actual id of the topic in the target file
Source_Topic_Id The topic id in the source file
Target_File     The target file
Relative_Path   The source file containing the reference as a relative file name
Absolute_Path   The source file containing the reference as an absolute file path
END
    title    =>qq(Unresolved GUID hrefs),
    head     =>qq(Xref found NNNN unresolved GUID hrefs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, q(bad), qw(guidHrefs txt))));

  formatTable(\@good, <<END,
Href            The href in the source file
Line_Location   The line location where the href occurred in the source file
Target_File     The target file
Relative_Path   The source file containing the reference as a relative file name
Absolute_Path   The source file containing the reference as an absolute file path
Target_File     The target file
END
    title    =>qq(Resolved GUID hrefs),
    head     =>qq(Xref found NNNN Resolved GUID hrefs on DDDD),
    file     =>(fpe($xref->reports, q(good), qw(guidHrefs txt))));
 } # reportGuidHrefs

sub reportXrefs($)                                                              #P Report bad xrefs
 {my ($xref) = @_;                                                              # Cross referencer
  reportRefs($xref, q(x));
 }

sub reportTopicRefs($)                                                          #P Report bad topic refs
 {my ($xref) = @_;                                                              # Cross referencer

  my %topicIdsToFile;                                                           # All the topic ids encountered - we have already reported the duplicates so now we can assume that there are no duplicates
  for my $file(sort keys %{$xref->topicIds})                                    # Each input file
   {if (my $topicId = $xref->topicIds->{$file})                                 # Topic Id for file - we report missing topicIs in: reportDuplicateTopicIds
     {$topicIdsToFile{$topicId} = $file;                                        # Topic Id to file
     }
   }

  my @bad; my @good;                                                            # Bad xrefs
  for   my $file(sort keys %{$xref->topicRefs})                                 # Each input file
   {my $sourceTopicId = $xref->topicIds->{$file};
    for my $href(sort keys %{$xref->topicRefs->{$file}})                        # Each topic ref in the file
     {my @text;

if ($href =~ m(#)s) # We will have to do something about this if we encounter href on topic/link ref that has # in the href.
 {cluck "# in href in topic reference requires new code";
 }
      next if $topicIdsToFile{$href};                                           # The href is satisfied by the topic id of a file containing a topic - we will assume that this has occurred as a result of renaming files and so is ok

      if (               ref($xref->topicRefs->{$file}{$href}))                 # Text associated with reference
       {@text =  sort keys %{$xref->topicRefs->{$file}{$href}};
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }

      my $f = absFromAbsPlusRel(fullFileName($file), $href);                    # Target file absolute
      if ($f)
       {if (!-e $f and !-e wwwDecode($f))                                       # Check target file
         {push @bad, my $p = [qq(No such file), $f, qq("$href"),
                             $sourceTopicId, $file, @text];
          $xref->fixRefs->{$file}{$href}++;
         }
        else
         {push @good, my $p = [$f, $href, $file];
         }
       }
     }
   }
  formatFileNames($xref, \@good, 0);                                            # Format file names for easy use on unix and windows

  $xref->badTopicRefs  = {map {$$_[1]=>$_} @bad};                               # Bad topic references
  $xref->goodTopicRefs = {map {$$_[1]=>$_} @good};                              # Good topic references

  my $in = $xref->inputFolder//'';
  formatTable(\@bad, <<END,
Reason          Reason the topic reference failed
FullFileName    Name of the targeted file
Href            Href text
Source_Topic_Id The topic id of the file containing the bad xref
Relative_Path   The source file containing the reference as a relative file name
Absolute_Path   The source file containing the reference as an absolute file path
Example_Text    Any text bracketed by the topic ref
END
    title    =>qq(Bad topicrefs),
    head     =>qq(Xref found NNNN Bad topicrefs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, qw(bad topicRefs txt))));

  formatTable(\@good, <<END,
FullFileName  The target file name
Href          The href text in the source file
Source        The source file
END
    title=>qq(Good topicrefs),
    head=>qq(Xref found NNNN Good topicrefs on DDDD),
    file=>(fpe($xref->reports, qw(good topicRefs txt))));
 }

sub reportConrefs($)                                                            #P Report bad conrefs refs
 {my ($xref) = @_;                                                              # Cross referencer
  reportRefs($xref, q(con));
 }

sub reportImages($)                                                             #P Reports on images and references to images
 {my ($xref) = @_;                                                              # Cross referencer

  my @bad;                                                                      # Bad images
  for my $file(sort keys %{$xref->images})                                      # Each input file
   {my $sourceTopicId = $xref->topicIds->{$file};
    for my $href(sort keys %{$xref->images->{$file}})                           # Each image in the file
     {my $image = absFromAbsPlusRel($file, $href);                              # Image relative to current file
      if (-e $image or -e wwwDecode($image))                                    # Actual image name or www encoded image name
       {$xref->goodImageRefs->{$image}++;                                       # Found image
       }
      else
       {push @bad, [$href, $image, $sourceTopicId, $file];                      # Missing image reference
        $xref->badImageRefs->{$image}++;                                        # Number of missing references
        $xref->fixRefs->{$file}{$href}++;
       }
     }
   }

  $xref->missingImageFiles = [@bad];                                            # Missing image file names

  formatTable([sort {$$a[0] cmp $$b[0]} @bad], <<END,
Href            Image reference in source file
Image           Targetted image name
Source_Topic_Id The topic id of the file containing the missing image
Source          The source file containing the image reference as an absolute file path

END
    title=>qq(Bad image references),
    head=>qq(Xref found NNNN bad image references on DDDD),
    summarize=>1,
    file=>(my $f = fpe($xref->reports, qw(bad imageRefs txt))));

  my $found = [map {[$xref->goodImageRefs->{$_}, $_]}
              keys %{$xref->goodImageRefs}];

  formatTable($found, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    title=>qq(Image files),
    head=>qq(Xref found NNNN image files found on DDDD),
    file=>(fpe($xref->reports, qw(good imagesFound txt))));

  my $missing = [map {[$xref->badImageRefs->{$_}, $_]}
                 sort keys %{$xref->badImageRefs}];

  formatTable($missing, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    title=>qq(Missing image references),
    head=>qq(Xref found NNNN images missing on DDDD),
    file=>(fpe($xref->reports, qw(bad imagesMissing txt))));
 }

sub reportParseFailed($)                                                        #P Report failed parses
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable($xref->parseFailed, <<END,
Source The file that failed to parse as an absolute file path
END
    title=>qq(Files failed to parse),
    head=>qq(Xref found NNNN files failed to parse on DDDD),
    file=>(my $f = fpe($xref->reports, qw(bad parseFailed txt))));
 }

sub reportXml1($)                                                            #P Report bad xml on line 1
 {my ($xref) = @_;                                                              # Cross referencer


  formatTable([sort keys %{$xref->badXml1}], <<END,
Source  The source file containing bad xml on line
END
    title=>qq(Bad Xml line 1),
    head=>qq(Xref found NNNN Files with the incorrect xml on line 1 on DDDD),
    file=>(my $f = fpe($xref->reports, qw(bad xmlLine1 txt))));
 }

sub reportXml2($)                                                            #P Report bad xml on line 2
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable([sort keys %{$xref->badXml2}], <<END,
Source  The source file containing bad xml on line
END
    title=>qq(Bad Xml line 2),
    head=>qq(Xref found NNNN Files with the incorrect xml on line 2 on DDDD),
    file=>(my $f = fpe($xref->reports, qw(bad xmlLine2 txt))));
 }

sub reportDocTypeCount($)                                                       #P Report doc type count
 {my ($xref) = @_;                                                              # Cross referencer

  my %d;
  for my $f(sort keys %{$xref->docType})
   {my $d = $xref->docType->{$f};
    $d{$d}++
   }

  formatTable(\%d, [qw(DocType)],
    title=>qq(Document types),
    head=>qq(Xref found NNNN different doc types on DDDD),
    file=>(fpe($xref->reports, qw(count docTypes txt))));
 }

sub reportTagCount($)                                                           #P Report tag counts
 {my ($xref) = @_;                                                              # Cross referencer

  my %d;
  for   my $f(sort keys %{$xref->tagCount})
   {for my $t(sort keys %{$xref->tagCount->{$f}})
     {my $d = $xref->tagCount->{$f}{$t};
      $d{$t} += $d;
     }
   }

  formatTable(\%d, [qw(Tag Count)],
    title=>qq(Tags),
    head=>qq(Xref found NNNN different tags on DDDD),
    file=>(fpe($xref->reports, qw(count tags txt))));
 }

sub reportAttributeCount($)                                                     #P Report attribute counts
 {my ($xref) = @_;                                                              # Cross referencer

  my %d;
  for   my $f(sort keys %{$xref->attributeCount})
   {for my $t(sort keys %{$xref->attributeCount->{$f}})
     {my $d = $xref->attributeCount->{$f}{$t};
      $d{$t} += $d;
     }
   }

  formatTable(\%d, [qw(Attribute Count)],
    title=>qq(Attributes),
    head=>qq(Xref found NNNN different attributes on DDDD),
    file=>(my $f = fpe($xref->reports, qw(count attributes txt))));
 }

sub reportValidationErrors($)                                                   #P Report the files known to have validation errors
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable([map {[$_]} sort keys %{$xref->validationErrors}], [qw(File)],
    title=>qq(Topics with validation errors),
    head=><<END,
Xref found NNNN topics with validation errors on DDDD
END
    file=>(fpe($xref->reports, qw(bad validationErrors txt))));
 }

sub checkBookMap($$)                                                            #P Check whether a bookmap is valid or not
 {my ($xref, $bookMap) = @_;                                                    # Cross referencer, bookmap

  for my $href($bookMap, sort keys %{$xref->topicRefs->{$bookMap}})             # Each topic ref in the bookmap
   {my $t = absFromAbsPlusRel($bookMap, $href);
    for my $field                                                               # Fields that report errors
     (qw(parseFailed badXml1 badXml2 badTopicRefs badXRefs
         imagesMissing badConRefs missingTopicIds
         validationErrors))
     {if ($xref->{$field}->{$t})
       {return [$field, $xref->topicIds->{$bookMap}, $bookMap, $href, $t];
       }
     }
   }
  undef                                                                         # No errors
 }

sub reportBookMaps($)                                                           #P Report on whether each bookmap is good or bad
 {my ($xref) = @_;                                                              # Cross referencer

  my @bad;
  my @good;
  for my $f(sort keys %{$xref->docType})
   {if ($xref->docType->{$f} =~ m(map\Z)s)
     {if (my $r = $xref->checkBookMap($f))
       {push @bad, $r;
       }
      else
       {push @good, [$f];
       }
     }
   }
  $xref-> badBookMaps = [@bad];                                                 # Bad bookmaps
  $xref->goodBookMaps = [@good];                                                # Good book maps

  formatTable(\@bad, <<END,
Reason          Reason bookmap failed
Source_Topic_Id The topic id of the failing bookmap
Bookmap         Bookmap source file name
Topic_Ref       Failing appendix, chapter or topic ref.
Topic_File      Targeted topic file if known
END
    title=>qq(Bookmaps with errors),
    head=><<END,
Xref found NNNN bookmaps with errors on DDDD
END
    summarize=>1,
    file=>(fpe($xref->reports, qw(bad bookMap txt))));

  formatTable(\@good, [qw(File)],
    title=>qq(Good bookmaps),
    head=><<END,
Xref found NNNN good bookmaps on DDDD
END
    file=>(fpe($xref->reports, qw(good bookMap txt))));
 }

sub reportTables($)                                                             #P Report on tables that have problems
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable($xref->badTables, <<END,
Reason          Reason bookmap failed
Attributes      The tag and attributes of the table element in question
Location        The location at which the error was detected
Source_File     The file in which the error was detected
END
    title=>qq(Tables with errors),
    head=><<END,
Xref found NNNN table errors on DDDD
END
    summarize=>1,
    file=>(fpe($xref->reports, qw(bad tables txt))));
 }

sub reportFileExtensionCount($)                                                 #P Report file extension counts
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable(countFileExtensions($xref->inputFolder), [qw(Ext Count)],
    title=>qq(File extensions),
    head=><<END,
Xref found NNNN different file extensions on DDDD
END
    file=>(fpe($xref->reports, qw(count fileExtensions txt))));
 }

sub reportFileTypes($)                                                          #P Report file type counts - takes too long in series
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable(countFileTypes
   ($xref->inputFolder, $xref->maximumNumberOfProcesses),
   [qw(Type Count)],
    title=>qq(Files types),
    head=><<END,
Xref found NNNN different file types on DDDD
END
    file=>(my $f = fpe($xref->reports, qw(count fileTypes txt))));
 }

sub reportNotReferenced($)                                                      #P Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.
 {my ($xref) = @_;                                                              # Cross referencer

  my %files = map {$_=>1}
    searchDirectoryTreesForMatchingFiles($xref->inputFolder);

  my %target;                                                                   # Targets of xrefs and conrefs
  $target{$xref->{goodConRefs}{$_}[2]}++ for keys %{$xref->{goodConRefs}};
  $target{$xref->{goodXRefs}  {$_}[2]}++ for keys %{$xref->{goodXRefs}};

  for my $file(sort keys %{$xref->goodImageRefs},                               # Remove referenced files
               sort keys %{$xref->goodTopicRefs},
               sort keys %target,
              )
   {delete $files{$file};
   }

  for my $file(sort keys %{$xref->docType})                                     # Remove bookmaps from consideration as they are not usually referenced
   {my $tag = $xref->docType->{$file};
    if ($tag =~ m(\Abookmap\Z)is)
     {delete $files{$file};
     }
   }

  $xref->notReferenced = \%files;
  formatTable([sort keys %files],
   [qw(FileNo Unreferenced)],
    title=>qq(Unreferenced files),
    head=><<END,
Xref found NNNN unreferenced files on DDDD.

These files are not mentioned in any href attribute and are not bookmaps.

END
    file=>(my $f = fpe($xref->reports, qw(bad notReferenced txt))));
 }

sub reportExternalXrefs($)                                                      #P Report external xrefs missing other attributes
 {my ($xref) = @_;                                                              # Cross referencer

  my @s;
  for   my $f(sort keys %{$xref->xrefBadScope})
   {my $sourceTopicId = $xref->topicIds->{$f};
    for my $h(sort keys %{$xref->xrefBadScope->{$f}})
     {my $s = $xref->xrefBadScope->{$f}{$h};
      push @s, [q(Bad scope attribute), $h, $s, $sourceTopicId, $f];
     }
   }

  for   my $f(sort keys %{$xref->xrefBadFormat})
   {my $sourceTopicId = $xref->topicIds->{$f};
    for my $h(sort keys %{$xref->xrefBadFormat->{$f}})
     {my $s = $xref->xrefBadFormat->{$f}{$h};
      push @s, [q(Bad format attribute), $h, $s, $sourceTopicId, $f];
     }
   }

  formatFileNames($xref, \@s, 4);                                               # Format file names for easy use on unix and windows

  formatTable(\@s, <<END,
Reason          The reason why the xref is unsatisfactory
Href            The href attribute of the xref in question
Xref_Statement  The xref statement in question
Source_Topic_Id The topic id of the source file containing file containing the bad external xref
File            The file containing the xref statement in question
File_Windows    The source file containing the xref statement in question as a relative file name
File_Unix       The source file containing the xref statement in question as an absolute file path
END
    title=>qq(Bad external xrefs),
    head=>qq(Xref found bad external xrefs on DDDD),
    file=>(my $f = fpe($xref->reports, qw(bad externalXrefs txt))));
 }

sub reportPossibleImprovements($)                                               #P Report improvements possible
 {my ($xref) = @_;                                                              # Cross referencer

  my @S;
  for   my $i(sort keys %{$xref->improvements})
   {push @S, @{$xref->improvements->{$i}};
   }

  formatFileNames($xref, \@S, 3);                                               # Format file names for easy use on unix and windows

  my @s = sort {$$a[0] cmp $$b[0]}
          sort {$$a[3] cmp $$b[3]} @S;

  formatTable(\@s, <<END,
Improvement     The improvement that might be made.
Text            The text that suggested the improvement.
Line_Number     The line number at which the improvement could be made.
Relative_Path   The file in which the improvement could be made.
Absolute_Path   The file in which the improvement could be made.
END
    title=>qq(Possible improvements),
    head=><<END,
Xref found NNNN opportunities for improvements that might be
made on DDDD
END
    file=>(fpe($xref->reports, qw(improvements txt))),
    summarize=>1);
 }

sub reportTopicDetails($)                                                       #P Things that occur once in each file
 {my ($xref) = @_;                                                              # Cross referencer

  my @t;
  for my $f(sort @{$xref->inputFiles})
   {push @t, [$xref->docType ->{$f}//q(),
              $xref->topicIds->{$f}//q(),
              $xref->author  ->{$f}//q(),
              $xref->title   ->{$f}//q(),
              $f,
             ];
   }

  formatFileNames($xref, \@t, 4);                                               # Format file names for easy use on unix and windows

  formatTable(\@t, <<END,
Tag             The outermost tag
Id              The id on the outermost tag
Author          The author of the topic
Title           The title of the topic
Relative_Path   The source file name as a relative file name
Absolute_Path   The source file name as an absolute file path
END
    title=>qq(Topics),
    head=><<END,
Xref found NNNN topics on DDDD
END
    file=>(fpe($xref->reports, qw(lists topics txt))),
    summarize=>1);
 }

sub reportTopicReuse($)                                                         #P Count how frequently each topic is reused
 {my ($xref) = @_;                                                              # Cross referencer

  my %t;
  for   my $f(sort keys %{$xref->topicRefs})
   {for my $t(sort keys %{$xref->topicRefs->{$f}})
     {my $file = absFromAbsPlusRel($f, $t);
      $t{$file}{$f}++;
     }
   }
  for my $t(keys %t)                                                            # Eliminate topicrefs that are used only once
   {if (keys (%{$t{$t}}) <= 1)
     {delete $t{$t};
     }
   }

  my @t;
  for   my $t(keys %t)                                                          # Target
   {for my $s(keys %{$t{$t}})                                                   # Source
     {push @t, [scalar(keys %{$t{$t}}), $t{$t}{$s},  $t, $s];
     }
   }

  my $t = [sort {$a->[0] <=> $b->[0]}                                           # Order report
           sort {$a->[2] cmp $b->[2]}  @t];

  for   my $i(keys @$t)                                                         # Deduplicate first column from third column
   {next unless $i;
    my $a = $t->[$i-1];
    my $b = $t->[$i];
    $b->[0] = '' if $a->[2] eq $b->[2];
   }

  formatTable($t,                                                               # Format report
               <<END,
Reuse           The number of times the target topic is reused over all topics
Count           The number of times the target topic is reused in the source topic
Target          The topic that is being reused == the target of reuse
Source          The topic that is referencing the reused topic
END
    title=>qq(Topic Reuses),
    head=><<END,
Xref found NNNN topics that are currently being reused on DDDD
END
    file=>(fpe($xref->reports, qw(lists topicReuse txt))),
    zero=>1,                                                                    # Reuse is very unlikely because the matching criteria is the MD5 sum
    summarize=>1);
 }

sub reportSimilarTopicsByTitle($)                                               #P Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names
 {my ($xref) = @_;                                                              # Cross referencer

  my %t;
  for   my $File(@{$xref->inputFiles})                                          # Each input file
   {my $F = fn $File;
    my $f = $F =~ s([0-9a-f]{32}\Z) (_)gsr;                                     # Remove md5 sum from file name
    $t{$f}{$F}++;
   }

  for my $t(keys %t)                                                            # Eliminate files that have no similar counter parts
   {if (keys (%{$t{$t}}) <= 1)
     {delete $t{$t};
     }
   }

  my @t;
  for   my $t(keys %t)                                                          # Target
   {for my $s(keys %{$t{$t}})                                                   # Source
     {push @t, [scalar(keys %{$t{$t}}), $t, $s];
     }
   }

  my $t = [sort {$b->[0] <=> $a->[0]}                                           # Order report so that most numerous are first
           sort {$a->[1] cmp $b->[1]}  @t];

  for   my $i(keys @$t)                                                         # Deduplicate first column from third column
   {next unless $i;
    my $a = $t->[$i-1];
    my $b = $t->[$i];
    $b->[0] = '' if $a->[1] eq $b->[1];
   }

  formatTable($t,                                                               # Format report
               <<END,
Similar          The number of topics similar to this one
Prefix           The prefix of the target file names being used for matching
Source           Topics that have the current prefix
END
    title=>qq(Topic Reuses),
    head=><<END,
Xref found NNNN topics that might be similar on DDDD
END
    file=>(fpe($xref->reports, qw(lists similar byTitle txt))),
    zero=>1,
    summarize=>1);
 }

sub reportSimilarTopicsByVocabulary($)                                          #P Report topics likely to be similar on the basis of their vocabulary
 {my ($xref) = @_;                                                              # Cross referencer

  my @m = grep {scalar(@$_) > 1}                                                # Partition into like topics based on vocabulary - select the partitions with more than one element
   setPartitionOnIntersectionOverUnionOfHashStringSets
    ($xref->matchTopics, $xref->vocabulary);

  my @t;
  for my $a(@m)
   {my ($first, @rest) = @$a;
    push @t, [scalar(@$a), $first], map {[q(), $_]} @rest;
    push @t, [q(), q()];
   }

  formatTable(\@t,                                                              # Format report
               <<END,
Similar          The number of similar topics in this block
Topic            One of the similar topics
END
    title=>qq(Topics with similar vocabulary),
    head=><<END,
Xref found NNNN topics that have similar vocabulary on DDDD
END
    file=>(my $f = fpe($xref->reports, qw(lists similar byVocabulary txt))));
 }

sub reportMd5Sum($)                                                             #P Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names
 {my ($xref) = @_;                                                              # Cross referencer

  my %f;                                                                        # {short file}{md5}++ means this short file name has the specified md5 sum.  We want there to be only one md5 sum per short file name
  for my $F(sort keys %{$xref->md5Sum})
   {if (my $m = $xref->md5Sum->{$F})
     {my $f = fn $F;
      $f{$f}{$m}++;
     }
   }

  for my $f(sort keys %f)                                                       # These are the good md5 sums that are in one-to-one correspondence with short file names
   {delete $f{$f} unless keys %{$f{$f}} == 1;
   }

  my @good;                                                                     # File name matches and md5 sum matches or opposite
  my @bad;                                                                      # Md5 sum matches but file name is not equal or file name is equal but md5 differs
  for my $F(sort keys %{$xref->md5Sum})
   {if (my $m = $xref->md5Sum->{$F})
     {my $f = fn $F;
      if ($f{$f}{$m})
       {push @good, [$m, $f, $F];
       }
      else
       {push @bad, [$m, $f, $F];
       }
     }
     ### Need check for undef $m
   }

  formatFileNames($xref, \@bad,  2);                                            # Format file names for easy use on unix and windows
  formatFileNames($xref, \@good, 2);

  formatTable(\@bad, <<END,
Md5_Sum           The md5 sum in question
Short_File_Name   The short name of the file
Relative_Path     The file name as a relative file name
Absolute_Path     The file name as an absolute file path
END
    title=>qq(Files whose short names are not bi-jective with their md5 sums),
    head=><<END,
Xref found NNNN such files on DDDD
END
    file=>(fpe($xref->reports, qw(bad shortNameToMd5Sum txt))),
    summarize=>1);

  formatTable(\@good, <<END,
Md5_Sum           The md5 sum in question
Short_File_Name   The short name of the file
Relative_Path     The file name as a relative file name
Absolute_Path     The file name as an absolute file path
END
    title=>qq(Files whose short names are bi-jective with their md5 sums),
    head=><<END,
Xref found NNNN such files on DDDD
END
    file=>(fpe($xref->reports, qw(good shortNameToMd5Sum txt))),
    summarize=>1);
 }

sub createSampleInputFiles($)                                                   #P Create sample input files for testing. The attribute B<inputFolder> supplies the name of the folder in which to create the sample files.
 {my ($N) = @_;                                                                 # Number of sample files
  my $in = q(in);
  clearFolder($in, 20);
  for my $n(1..$N)
   {my $o = $n + 1; $o -= $N if $o > $N;
    my $f = owf(fpe($in, $n, q(dita)), <<END);
<concept id="c$n">
  <title>Concept $n refers to $o</title>
  <conbody id="b$n">
     <xref id="x$n"  format="dita" href="$o.dita#c$o/x$o">Good</xref>
     <xref id="x$n"  format="dita" href="$o.dita#c$n/x$o">Duplicate id</xref>
     <xref id="b1$n" format="dita" href="bad$o.dita#c$o/x$o">Bad file</xref>
     <xref id="b2$n" format="dita" href="$o.dita#c$n/x$o">Bad topic id</xref>
     <xref id="b3$n" format="dita" href="$o.dita#c$o/x$n">Bad id in topic</xref>
     <xref id="g1$n" format="dita" href="$o.dita#c$o">Good 1</xref>
     <xref id="g2$n" format="dita" href="#c$o/x$o">Good 2</xref>
     <xref id="g3$n" format="dita" href="#c$o">Good 3</xref>
     <p conref="#c$n">Good conref</p>
     <p conref="#b$n">Bad conref</p>
     <image href="a$n.png"/>
     <image href="b$n.png"/>
  </conbody>
</concept>
END
   }

  owf(fpe($in, qw(act1 dita)), <<END);
<concept id="guid-000">
  <title id="title">All Timing Codes Begin Here</title>
  <author>Phil</author>
  <conbody>                                                  ~/r/aci/download/
    <p>Note: see below</p>
    <p>Important: ignore all notes above</p>
\    <image href="guid-000"/>
    <image href="guid-act1"/>
    <image href="guid-9999"/>
    <image href="act1.dita"/>
    <xref/>
  </conbody>
</concept>
END

  owf(fpe($in, qw(act2 dita)), <<END);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="c2">
  <title id="title">Jumping Through Hops</title>
  <conbody>
    <section>
      <title/>
      <xref  format="dita" href="act1.dita#c1/title"/>
      <note  conref="act2.dita#c2/title"/>
      <xref  format="dita" href="9999#c1/title"/>
      <xref  format="dita" href="guid-000#guid-000/title"/>
      <xref  format="dita" href="guid-000#guid-000/title2"/>
      <xref  format="dita" href="guid-000#c1/title2"/>
      <xref  format="dita" href="guid-999#c1/title2"/>
      <xref  href="http://"/>
      <image href="9999.png"/>
      <link href="guid-000"/>
      <link href="guid-999"/>
      <link href="act1.dita"/>
      <link href="act9999.dita"/>
      <p conref="9999.dita"/>
    </section>
    <required-cleanup>PLEX18</required-cleanup>
  </conbody>
</concept>
<!--linted: 2018-Nov-23 -->
END

  owf(fpe($in, qw(act3 dita)), <<END);
<concept id="c3">
  <title>Jumping Through Hops</title>
  <conbody>
    <p/>
  </body>
</concept>
END

  owf(fpe($in, qw(act4 dita)), <<END);
<concept id="c4">
  <taskbody/>
</concept>
END

  owf(fpe($in, qw(act5 dita)), <<END);
<concept id="c5">
  <taskbody/>
</concept>
END

  owf(fpe($in, qw(table dita)), <<END);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd" []>
<concept id="table">
  <title>Tables</title>
  <conbody>
    <table>
      <tgroup cols="1">
        <thead>
          <row>
            <entry>
              <p>Significant Event</p>
            </entry>
            <entry>
              <p>Audit Event</p>
            </entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry/>
          </row>
        </tbody>
      </tgroup>
    </table>
    <table>
      <tgroup cols="1">
        <colspec/>
        <colspec/>
        <thead>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
          <row>
            <entry>aaaa</entry>
            <entry>bbbb</entry>
          </row>
        </tbody>
      </tgroup>
    </table>
  </conbody>
</concept>
END

  owf(fpe($in, qw(map bookmap ditamap)), <<END);
<map id="m1">
  <title>Test</title>
  <chapter  href="yyyy.dita">
    <topicref href="../act1.dita">Interesting topic</topicref>
    <topicref href="../act2.dita"/>
    <topicref href="../map/r.txt"/>
    <topicref href="9999.dita"/>
    <topicref href="bbb.txt"/>
    <topicref href="guid-000"/>
    <topicref href="guid-888"/>
    <topicref href="guid-999"/>
  </chapter>
</map>
END
  owf(fpe($in, qw(map bookmap2 ditamap)), <<END);
<map id="m2">
  <title>Test 2</title>
  <chapter  href="zzzz.dita">
    <topicref href="../act1.dita">Interesting topic</topicref>
    <topicref href="../act2.dita"/>
    <topicref href="../map/r.txt"/>
    <topicref href="9999.dita"/>
    <topicref href="bbb.txt"/>
    <topicref href="guid-000"/>
    <topicref href="guid-888"/>
    <topicref href="guid-999"/>
  </chapter>
</map>
END
  createEmptyFile(fpe($in, qw(a1 png)));
 }

#D
# podDocumentation
=pod

=encoding utf-8

=head1 Name

Data::Edit::Xml::Xref - Cross reference Dita XML, match topics and ameliorate missing references.

=head1 Synopsis

Check the references in a larghe corpus of Dita XML documents held in folder
L<inputFolder|/inputFolder> running processes in parallel where ever possible
to take advantage of multi-cpu computers:

  use Data::Edit::Xml::Xref;

  my $x = xref(inputFolder              => q(in),
               maximumNumberOfProcesses => 512,
               relativePath             => q(out),
               fixBadRefs               => 1,
               matchTopics              => 0.9,
              );

The cross reference analysis can be requested as a status line:

  ok nws($x->statusLine) eq nws(<<END);
Xref: 108 references fixed, 50 bad xrefs, 16 missing image files, 16 missing image references, 13 bad first lines, 13 bad second lines, 9 bad conrefs, 9 duplicate topic ids, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 6 files not referenced, 4 invalid guid hrefs, 2 bad book maps, 2 bad tables, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END

Or as a tabular report:

  ok nws($x->statusTable) eq nws(<<END);
Xref:
    Count  Condition
 1    108  references fixed
 2     50  bad xrefs
 3     16  missing image files
 4     16  missing image references
 5     13  bad first lines
 6     13  bad second lines
 7      9  files with bad conrefs
 8      9  bad conrefs
 9      9  files with bad xrefs
10      9  duplicate topic ids
11      8  duplicate ids
12      6  bad topicrefs
13      6  files not referenced
14      4  invalid guid hrefs
15      2  bad book maps
16      2  bad tables
17      1  href missing
18      1  file failed to parse
19      1  External xrefs with no format=html
20      1  External xrefs with no scope=external
END

More detailed reports are produced in the L<reports|/reports> folder:

  $x->reports

and indexed by the reports report:

  reports/reports.txt

which contains a list of all the reports generated:

    Rows  Title                                                           File
 1     5  Attributes                                                      reports/count/attributes.txt
 2    13  Bad Xml line 1                                                  reports/bad/xmlLine1.txt
 3    13  Bad Xml line 2                                                  reports/bad/xmlLine2.txt
 4     9  Bad conRefs                                                     reports/bad/ConRefs.txt
 5     2  Bad external xrefs                                              reports/bad/externalXrefs.txt
 6    16  Bad image references                                            reports/bad/imageRefs.txt
 7     9  Bad topicrefs                                                   reports/bad/topicRefs.txt
 8    50  Bad xRefs                                                       reports/bad/XRefs.txt
 9     2  Bookmaps with errors                                            reports/bad/bookMap.txt
10     2  Document types                                                  reports/count/docTypes.txt
11     8  Duplicate id definitions within files                           reports/bad/idDefinitionsDuplicated.txt
12     3  Duplicate topic id definitions                                  reports/bad/topicIdDefinitionsDuplicated.txt
13     3  File extensions                                                 reports/count/fileExtensions.txt
14     1  Files failed to parse                                           reports/bad/parseFailed.txt
15     0  Files types                                                     reports/count/fileTypes.txt
16    16  Files whose short names are bi-jective with their md5 sums      reports/good/shortNameToMd5Sum.txt
17     0  Files whose short names are not bi-jective with their md5 sums  reports/bad/shortNameToMd5Sum.txt
18   108  Fixes Applied To Failing References                             reports/lists/referencesFixed.txt
19     0  Good bookmaps                                                   reports/good/bookMap.txt
20     9  Good conRefs                                                    reports/good/ConRefs.txt
21     5  Good topicrefs                                                  reports/good/topicRefs.txt
22     8  Good xRefs                                                      reports/good/XRefs.txt
23     1  Guid topic definitions                                          reports/lists/guidsToFiles.txt
24     2  Image files                                                     reports/good/imagesFound.txt
25     1  Missing hrefs                                                   reports/bad/missingHrefAttributes.txt
26    16  Missing image references                                        reports/bad/imagesMissing.txt
27     4  Possible improvements                                           reports/improvements.txt
28     2  Resolved GUID hrefs                                             reports/good/guidHrefs.txt
29     2  Tables with errors                                              reports/bad/tables.txt
30    23  Tags                                                            reports/count/tags.txt
31    11  Topic Reuses                                                    reports/lists/topicReuse.txt
32     0  Topic Reuses                                                    reports/lists/similar/byTitle.txt
33    16  Topics                                                          reports/lists/topics.txt
34    15  Topics with similar vocabulary                                  reports/lists/similar/byVocabulary.txt
35     0  Topics with validation errors                                   reports/bad/validationErrors.txt
36     0  Topics without ids                                              reports/bad/topicIdDefinitionsMissing.txt
37     6  Unreferenced files                                              reports/bad/notReferenced.txt
38    11  Unresolved GUID hrefs                                           reports/bad/guidHrefs.txt

File names in reports can be made relative to a specified directory named on the

  relativePath => q(out)

attribute.

=head2 Fix bad references

It is often desirable to ameliorate unresolved Dita href attributes so that
incomplete content can be loaded into a content management system.  The

  fixBadRefs => 1

attribute requests that the:

 href

attribute be renamed to:

 xtrf

if the href attribute specification cannt be resolved in the current corpus.

=head2 Topic Matching

Topics can be matched on title and vocabulary to assist authors in finding
similar topics by specifying the:

  matchTopics => 0.9

attribute where the value of this attribute is the confidence level between 0
and 1.

Topic matching might take some time for large input folders.

=head3 Title matching

Title sorts topics by their titles so that topic with similar titles can be
easily located:

    Similar  Prefix        Source
 1       14  c_Notices__   c_Notices_5614e96c7a3eaf3dfefc4a455398361b
 2           c_Notices__   c_Notices_14a9f467215dea879d417de884c21e6d
 3           c_Notices__   c_Notices_19011759a2f768d76581dc3bba170a44
 4           c_Notices__   c_Notices_aa741e6223e6cf8bc1a5ebdcf0ba867c
 5           c_Notices__   c_Notices_f0009b28c3c273094efded5fac32b83f
 6           c_Notices__   c_Notices_b1480ac1af812da3945239271c579bb1
 7           c_Notices__   c_Notices_5f3aa15d024f0b6068bd8072d4942f6d
 8           c_Notices__   c_Notices_17c1f39e8d70c765e1fbb6c495bedb03
 9           c_Notices__   c_Notices_7ea35477554f979b3045feb369b69359
10           c_Notices__   c_Notices_4f200259663703065d247b35d5500e0e
11           c_Notices__   c_Notices_e3f2eb03c23491c5e96b08424322e423
12           c_Notices__   c_Notices_06b7e9b0329740fc2b50fedfecbc5a94
13           c_Notices__   c_Notices_550a0d84dfc94982343f58f84d1c11c2
14           c_Notices__   c_Notices_fa7e563d8153668db9ed098d0fe6357b
15        3  c_Overview__  c_Overview_f9e554ee9be499368841260344815f58
16           c_Overview__  c_Overview_f234dc10ea3f4229d0e1ab4ad5e8f5fe
17           c_Overview__  c_Overview_96121d7bcd41cf8be318b96da0049e73


=head3 Vocabulary matching

Vocabulary matching compares the vocabulary of pairs of topics: topics with
similar vocabularies within the confidence level specified are reported
together:

    Similar  Topic
 1        8  in/1.dita
 2           in/2.dita
 3           in/3.dita
 4           in/4.dita
 5           in/5.dita
 6           in/6.dita
 7           in/7.dita
 8           in/8.dita
 9
10        2  in/map/bookmap.ditamap
11           in/map/bookmap2.ditamap
12
13        2  in/act4. dita
14           in/act5.dita

=head1 Description

Cross reference Dita XML, match topics and ameliorate missing references.


Version 20190131.


The following sections describe the methods in each functional area of this
module.  For an alphabetic listing of all methods by name see L<Index|/Index>.



=head1 Cross reference

Check the cross references in a set of Dita files and report the results.

=head2 xref(%)

Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>

     Parameter    Description
  1  %attributes  Attributes

B<Example:>


    my $N = 8;                                                                    
  


=head2 Data::Edit::Xml::Xref Definition


Attributes used by the Xref cross referencer.




=head3 Input fields


B<fixBadRefs> - Try to fix bad references in L<these files|/fixRefs> where possible by either changing a guid to a file name assuming the right file is present or failing that by moving the failing reference to the "xtrf" attribute.

B<inputFolder> - A folder containing the dita and ditamap files to be cross referenced.

B<matchTopics> - Match topics by title and by vocabulary to the specified confidence level between 0 and 1.  This operation might take some time to complete on a large corpus.

B<maximumNumberOfProcesses> - Maximum number of processes to run in parallel at any one time.

B<relativePath> - Report files relative to this path or absolutely if undefined.

B<reports> - Reports folder: the cross referencer will write reports to files in this folder.

B<summary> - Print the summary line.



=head3 Output fields


B<attributeCount> - {file}{attribute} == count of the different xml attributes found in the xml files.

B<author> - {file} = author of this file.

B<badBookMaps> - Bad book maps.

B<badConRefs> - {sourceFile} = [file, href] indicating the file has at least one bad conref.

B<badConRefsList> - Bad conrefs - by file.

B<badGuidHrefs> - Bad conrefs - all.

B<badImageRefs> - Consolidated images missing.

B<badTables> - Array of tables that need fixing.

B<badTopicRefs> - [file, href]   Invalid href attributes found on topicref tags.

B<badXRefs> - Bad Xrefs - by file

B<badXRefsList> - Bad Xrefs - all

B<badXml1> - [Files] with a bad xml encoding header on the first line.

B<badXml2> - [Files] with a bad xml doc type on the second line.

B<conRefs> - {file}{href}   Count of conref definitions in each file.

B<docType> - {file} == docType:  the docType for each xml file.

B<duplicateIds> - [file, id]     Duplicate id definitions within each file.

B<duplicateTopicIds> - [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.

B<fileExtensions> - Default file extensions to load

B<fixRefs> - {file}{ref} where the href or conref target is not present.

B<fixedRefs> - [] hrefs and conrefs from L<fixRefs|/fixRefs which have been ameliorated where possible by either changing a guid to a file name assuming the right file is present or failing that by moving the failing reference to the "xtrf" attribute.

B<goodBookMaps> - Good book maps.

B<goodConRefs> - Good con refs - by file.

B<goodConRefsList> - Good con refs - all.

B<goodGuidHrefs> - {file}{href}{location}++ where a href that starts with GUID- has been correctly resolved.

B<goodImageRefs> - Consolidated images found.

B<goodTopicRefs> - Good topic refs.

B<goodXRefs> - Good xrefs - by file.

B<goodXRefsList> - Good xrefs - all.

B<guidHrefs> - {file}{href} = location where href starts with GUID- and is thus probably a guid.

B<guidToFile> - {topic id which is a guid} = file defining topic id.

B<ids> - {file}{id}     Id definitions across all files.

B<images> - {file}{href}   Count of image references in each file.

B<improvements> - Suggested improvements - a list of improvements that might be made.

B<inputFiles> - Input files from L<inputFolder|/inputFolder>.

B<inputFolderImages> - {filename} = full file name which works well for images because the md5 sum in their name is probably unique.

B<md5Sum> - MD5 sum for each input file.

B<missingImageFiles> - [file, href] == Missing images in each file.

B<missingTopicIds> - Missing topic ids.

B<noHref> - Tags that should have an href but do not have one.

B<notReferenced> - Files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.

B<parseFailed> - [file] files that failed to parse.

B<results> - Summary of results table.

B<sourceFile> - The source file from which this structure was generated.

B<statusLine> - Status line summarizing the cross reference.

B<statusTable> - Status table summarizing the cross reference.

B<tagCount> - {file}{tags} == count of the different tag names found in the xml files.

B<title> - {file} = title of file.

B<topicIds> - {file} = topic id - the id on the outermost tag.

B<topicRefs> - {file}{href}++ References from bookmaps to topics via appendix, chapter, topicref.

B<validationErrors> - True means that Lint detected errors in the xml contained in the file.

B<vocabulary> - The text of each topic shorn of attributes for vocabulary comparison.

B<xRefs> - {file}{href}++ Xrefs references.

B<xrefBadFormat> - External xrefs with no format=html.

B<xrefBadScope> - External xrefs with no scope=external.



=head1 Attributes


The following is a list of all the attributes in this package.  A method coded
with the same name in your package will over ride the method of the same name
in this package and thus provide your value for the attribute in place of the
default value supplied for this attribute by this package.

=head2 Replaceable Attribute List


improvementLength 


=head2 improvementLength

Improvement length




=head1 Private Methods

=head2 lll(@)

Write a message

     Parameter  Description
  1  @m         Message text

=head2 countLevels($$)

Count has elements to the specified number of levels

     Parameter  Description
  1  $l         Levels
  2  $h         Hash

=head2 relativeFilePath($$)

Format file name for easy use on windows

     Parameter  Description
  1  $xref      Xref
  2  $file      File

=head2 unixFile($$)

Format file name for easy use on unix

     Parameter  Description
  1  $xref      Xref
  2  $file      File

=head2 formatFileNames($$$)

Format file names for easy use on unix and windows

     Parameter  Description
  1  $xref      Xref
  2  $array     Array of arrays containing file names in unix format
  3  $column    Column containing file names

=head2 loadInputFiles($)

Load the names of the files to be processed

     Parameter  Description
  1  $xref      Cross referencer

=head2 analyzeOneFile($)

Analyze one input file

     Parameter  Description
  1  $iFile     File to analyze

=head2 reportGuidsToFiles($)

Map and report guids to files

     Parameter  Description
  1  $xref      Xref results

=head2 fixOneFile($$)

Fix one file by moving unresolved references to the xtrf attribute

     Parameter  Description
  1  $xref      Xref results
  2  $file      File to fix

=head2 fixFiles($)

Fix files by moving unresolved references to the xtrf attribute

     Parameter  Description
  1  $xref      Xref results

=head2 analyze($)

Analyze the input files

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateIds($)

Report duplicate ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateTopicIds($)

Report duplicate topic ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportNoHrefs($)

Report locations where an href was expected but not found

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportRefs($$)

Report bad references found in xrefs or conrefs as they have the same structure

     Parameter  Description
  1  $xref      Cross referencer
  2  $type      Type of reference to be processed

=head2 reportGuidHrefs($)

Report on guid hrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXrefs($)

Report bad xrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicRefs($)

Report bad topic refs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportConrefs($)

Report bad conrefs refs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportImages($)

Reports on images and references to images

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportParseFailed($)

Report failed parses

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXml1($)

Report bad xml on line 1

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportXml2($)

Report bad xml on line 2

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDocTypeCount($)

Report doc type count

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTagCount($)

Report tag counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportAttributeCount($)

Report attribute counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportValidationErrors($)

Report the files known to have validation errors

     Parameter  Description
  1  $xref      Cross referencer

=head2 checkBookMap($$)

Check whether a bookmap is valid or not

     Parameter  Description
  1  $xref      Cross referencer
  2  $bookMap   Bookmap

=head2 reportBookMaps($)

Report on whether each bookmap is good or bad

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTables($)

Report on tables that have problems

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileExtensionCount($)

Report file extension counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileTypes($)

Report file type counts - takes too long in series

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportNotReferenced($)

Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportExternalXrefs($)

Report external xrefs missing other attributes

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportPossibleImprovements($)

Report improvements possible

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicDetails($)

Things that occur once in each file

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTopicReuse($)

Count how frequently each topic is reused

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportSimilarTopicsByTitle($)

Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportSimilarTopicsByVocabulary($)

Report topics likely to be similar on the basis of their vocabulary

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportMd5Sum($)

Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names

     Parameter  Description
  1  $xref      Cross referencer

=head2 createSampleInputFiles($)

Create sample input files for testing. The attribute B<inputFolder> supplies the name of the folder in which to create the sample files.

     Parameter  Description
  1  $N         Number of sample files


=head1 Index


1 L<analyze|/analyze> - Analyze the input files

2 L<analyzeOneFile|/analyzeOneFile> - Analyze one input file

3 L<checkBookMap|/checkBookMap> - Check whether a bookmap is valid or not

4 L<countLevels|/countLevels> - Count has elements to the specified number of levels

5 L<createSampleInputFiles|/createSampleInputFiles> - Create sample input files for testing.

6 L<fixFiles|/fixFiles> - Fix files by moving unresolved references to the xtrf attribute

7 L<fixOneFile|/fixOneFile> - Fix one file by moving unresolved references to the xtrf attribute

8 L<formatFileNames|/formatFileNames> - Format file names for easy use on unix and windows

9 L<lll|/lll> - Write a message

10 L<loadInputFiles|/loadInputFiles> - Load the names of the files to be processed

11 L<relativeFilePath|/relativeFilePath> - Format file name for easy use on windows

12 L<reportAttributeCount|/reportAttributeCount> - Report attribute counts

13 L<reportBookMaps|/reportBookMaps> - Report on whether each bookmap is good or bad

14 L<reportConrefs|/reportConrefs> - Report bad conrefs refs

15 L<reportDocTypeCount|/reportDocTypeCount> - Report doc type count

16 L<reportDuplicateIds|/reportDuplicateIds> - Report duplicate ids

17 L<reportDuplicateTopicIds|/reportDuplicateTopicIds> - Report duplicate topic ids

18 L<reportExternalXrefs|/reportExternalXrefs> - Report external xrefs missing other attributes

19 L<reportFileExtensionCount|/reportFileExtensionCount> - Report file extension counts

20 L<reportFileTypes|/reportFileTypes> - Report file type counts - takes too long in series

21 L<reportGuidHrefs|/reportGuidHrefs> - Report on guid hrefs

22 L<reportGuidsToFiles|/reportGuidsToFiles> - Map and report guids to files

23 L<reportImages|/reportImages> - Reports on images and references to images

24 L<reportMd5Sum|/reportMd5Sum> - Good files have short names which uniquely represent their content and thus can be used instead of their md5sum to generate unique names

25 L<reportNoHrefs|/reportNoHrefs> - Report locations where an href was expected but not found

26 L<reportNotReferenced|/reportNotReferenced> - Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

27 L<reportParseFailed|/reportParseFailed> - Report failed parses

28 L<reportPossibleImprovements|/reportPossibleImprovements> - Report improvements possible

29 L<reportRefs|/reportRefs> - Report bad references found in xrefs or conrefs as they have the same structure

30 L<reportSimilarTopicsByTitle|/reportSimilarTopicsByTitle> - Report topics likely to be similar on the basis of their titles as expressed in the non Guid part of their file names

31 L<reportSimilarTopicsByVocabulary|/reportSimilarTopicsByVocabulary> - Report topics likely to be similar on the basis of their vocabulary

32 L<reportTables|/reportTables> - Report on tables that have problems

33 L<reportTagCount|/reportTagCount> - Report tag counts

34 L<reportTopicDetails|/reportTopicDetails> - Things that occur once in each file

35 L<reportTopicRefs|/reportTopicRefs> - Report bad topic refs

36 L<reportTopicReuse|/reportTopicReuse> - Count how frequently each topic is reused

37 L<reportValidationErrors|/reportValidationErrors> - Report the files known to have validation errors

38 L<reportXml1|/reportXml1> - Report bad xml on line 1

39 L<reportXml2|/reportXml2> - Report bad xml on line 2

40 L<reportXrefs|/reportXrefs> - Report bad xrefs

41 L<unixFile|/unixFile> - Format file name for easy use on unix

42 L<xref|/xref> - Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder.

=head1 Installation

This module is written in 100% Pure Perl and, thus, it is easy to read,
comprehend, use, modify and install via B<cpan>:

  sudo cpan install Data::Edit::Xml::Xref

=head1 Author

L<philiprbrenan@gmail.com|mailto:philiprbrenan@gmail.com>

L<http://www.appaapps.com|http://www.appaapps.com>

=head1 Copyright

Copyright (c) 2016-2018 Philip R Brenan.

This module is free software. It may be used, redistributed and/or modified
under the same terms as Perl itself.

=cut



# Tests and documentation

sub test
 {my $p = __PACKAGE__;
  binmode($_, ":utf8") for *STDOUT, *STDERR;
  return if eval "eof(${p}::DATA)";
  my $s = eval "join('', <${p}::DATA>)";
  $@ and die $@;
  eval $s;
  $@ and die $@;
  1
 }

test unless caller;

1;
# podDocumentation
__DATA__
use warnings FATAL=>qw(all);
use strict;
use Test::More tests=>1;

my $windows = $^O =~ m(MSWin32)is;
my $mac     = $^O =~ m(darwin)is;

Test::More->builder->output("/dev/null")                                        # Show only errors during testing
  if ((caller(1))[0]//'Data::Edit::Xml::Xref') eq "Data::Edit::Xml::Xref";

if (!$windows) {

if (1) {
  my $N = 8;                                                                    #Txref

  clearFolder(q(reports), 42);
  createSampleInputFiles($N);

  my $x = xref(inputFolder              => q(in),
               fixBadRefs               => 1,
               maximumNumberOfProcesses => 2,
               matchTopics              => 0.9,
               relativePath             => q(in));

  ok nws($x->statusLine) eq nws(<<END);
Xref: 108 references fixed, 50 bad xrefs, 16 missing image files, 16 missing image references, 13 bad first lines, 13 bad second lines, 9 bad conrefs, 9 duplicate topic ids, 9 files with bad conrefs, 9 files with bad xrefs, 8 duplicate ids, 6 bad topicrefs, 6 files not referenced, 4 invalid guid hrefs, 2 bad book maps, 2 bad tables, 1 External xrefs with no format=html, 1 External xrefs with no scope=external, 1 file failed to parse, 1 href missing
END

  say STDERR $x->statusTable;
# say STDERR "AAAA ", dump($x->notReferenced);
 }

 }
else
 {ok 1 for 1..1;
 }

1


