#! /usr/bin/perl -w
use lib '/usr/lib/perl'; use INN::Config;
use strict;

# @(#)scanspool 1.20 4/6/92 00:47:35
#
# Written by: Landon Curt Noll (chongo was here  /\../\)
#
# This code is placed in the public domain.
#
# NOTE: This take a while, -v is a good thing if you want to know
#       how far this program has progressed.
#
# This program will scan first the active file, noting problems such as:
#
#       malformed line
#       group aliased to a non-existent group
#       group aliased to a group that is also aliased
#
# Then it will examine all articles under your news spool directory,
# looking for articles that:
#
#       basename that starts with a leading 0
#       basename that is out of range with the active file
#       does not contain a Newsgroups header field
#       article that is all header and no text
#       is in a directory for which there is no active group
#       article that is in a group to which it does not belong
#
# Scanspool understands aliased groups.  Thus, if an article is posted
# to foo.old.name that is aliased to foo.bar, it will be expected to
# be found under foo.bar and not foo.old.name.
#
# Any group that is of type 'j' or 'x' (4th field of the active file)
# will be allowed to show up under the junk group.
#
# Scanspool assumes that the path of a valid newsgroup's directory
# from the top of the spool tree will not contain any "." character.
# Thus, directories such as out.going, tmp.dir, in.coming and
# news.archive will not be searched.  This program also assumes that
# article basenames contain only decimal digits.  Last, files under
# the top level directory "lost+found" are not scanned.
#
# The output of scanspool will start with one of 4 forms:
#
#    FATAL:         fatal or internal error                     (to stderr)
#
#    WARN:          active or article format problem,           (to stderr)
#                   group alias problem, find error,
#                   article open error
#
#    path/123:      basename starts with 0,                     (to stdout)
#                   article number out of range,
#                   article in the wrong directory,
#                   article in directory not related to
#                   an active non-aliases newsgroup
#
#    \t ...         verbose message starting with a tab         (to stdout)

# Data structures
#
my %gname2type;
# $gname2type{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => 4th active field  (y, n, x, ...)
#                 alias type is "=", not "=foo.bar"
#
my %realgname;
# $realgname{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => newsgroup name in foo.dot.form
#                 if type is =, this will be a.b, not $name
#
my %lowart;
# $lowart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => lowest article allowed in the group
#                 if type is =, this is not valid
#
my %highart;
# $highart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => highest article allowed in the group
#                 if type is =, this is not valid
#                 If $highart{$name} < $lowart{$name},
#                 then the group should be empty

# perl requirements
#
use Getopt::Std;

# Define variables Getopt::Std uses for --help and --version.
$Getopt::Std::STANDARD_HELP_VERSION = 1;
our $VERSION = $INN::Config::version;
$VERSION =~ s/INN //;

$0 =~ s!.*/!!;

my $usage = "Usage:
  $0 [-cnv] [-a active-file] [-s spool-dir]

  Perform a sanity scan over all articles in <patharticles>

Options:
  -a active-file     active file to use (default <pathdb>/active)
  -c                 check article filenames, don't scan the articles
  -n                 don't throttle innd
  -s spool-dir       spool tree (default <patharticles>)
  -v                 verbose mode
                     verbose messages begin with a tab
                     show articles found in non-active directories
";

sub HELP_MESSAGE {
    print $usage;
    exit(0);
}

# global constants
#
my $prog = $0;                       # our name
my $spool = "$INN::Config::patharticles";
my $active = "$INN::Config::active";
my $ctlinnd = "$INN::Config::pathbin/ctlinnd";
my $reason = "running scanspool";    # throttle reason

# parse args
#
my %opt;
getopts("a:s:vcn", \%opt) || die $usage;
$active = $opt{'a'} if defined $opt{'a'};
$spool = $opt{'s'} if defined $opt{'s'};

# throttle innd unless -n
#
if (!defined $opt{'n'}) {
    system("$ctlinnd throttle '$reason' >/dev/null 2>&1");
}

# process the active file
#
parse_active($active);

# check the spool directory
#
check_spool($spool);

# unthrottle innd unless -n
#
if (!defined $opt{'n'}) {
    system("$ctlinnd go '$reason' >/dev/null 2>&1");
}

# all done
exit(0);

# parse_active - parse the active file
#
# From the active file, fill out the %gname2type (type of newsgroup)
# and %realgname (real/non-aliased name of group), %lowart & %highart
# (low and high article numbers).  This routine will also check for
# aliases to missing groups or groups that are also aliases.
#
sub parse_active {
    my ($active) = @_;    # the name of the active file to use
    my $ACTIVE;           # active file handle
    my $line;             # active file line
    my $name;             # name of newsgroup
    my $low;              # low article number
    my $high;             # high article number
    my $type;             # type of newsgroup (4th active field)
    my $alias;            # realname of an aliased group
    my $linenum;          # active file line number

    # if verbose (-v), say what we are doing
    print "\tscanning $active\n" if defined $opt{'v'};

    # open the active file
    open($ACTIVE, '<', $active) || fatal(1, "cannot open $active");

    # parse each line
    $linenum = 0;
    while ($line = <$ACTIVE>) {

        # count the line
        ++$linenum;

        # verify that we have a correct number of tokens
        if ($line !~ /^\S+ 0*(\d+) 0*(\d+) \S+$/o) {
            problem("WARNING: active line is mal-formed at line $linenum");
            next;
        }
        ($name, $high, $low, $type)
          = $line =~ /^(\S+) 0*(\d+) 0*(\d+) (\S+)$/o;

        # watch for duplicate entries
        if (defined $realgname{$name}) {
            problem("WARNING: ignoring duplicate group: $name"
                  . ", at active line $linenum");
            next;
        }

        # record which type it is
        $gname2type{$name} = $type;

        # record the low and high article numbers
        $lowart{$name} = $low;
        $highart{$name} = $high;

        # determine the directory and real group name
        if ($type eq "j" || $type eq "x") {
            $alias = $name;
        } elsif ($type =~ /^=(.+)/o) {
            $alias = $1;
            $gname2type{$name} = "=";    # rename type to be just =
        } else {
            $alias = $name;
        }
        $realgname{$name} = $alias;
    }

    # close the active file
    close $ACTIVE;

    # be sure that any alias type is aliased to a real group
    foreach my $name (keys %realgname) {

        # skip if not an alias type
        next if $gname2type{$name} ne "=";

        # be sure that the alias exists
        $alias = $realgname{$name};
        if (!defined $realgname{$alias}) {
            problem("WARNING: alias for $name: $alias, is not a group");
            next;
        }

        # be sure that the alias is not an alias of something else
        if ($gname2type{$alias} eq "=") {
            problem("WARNING: alias for $name: $alias, is also an alias");
            next;
        }
    }
    return;
}

# problem - report a problem to stdout
#
# Print a message to stdout.  Parameters are space separated.
# A final newline is appended to it.
#
# usage:
#       problem(arg, arg2, ...)
#
sub problem {
    # print the line with the header field and newline
    my $line = join(" ", @_);
    print STDERR $line, "\n";
    return;
}

# fatal - report a fatal error to stderr and exit
#
# Print a message to stderr.  The message has the program name prepended
# to it.  Parameters are space separated.  A final newline is appended
# to it.  This function exits with the code of exitval.
#
# usage:
#       fatal(exitval, arg, arg2, ...)
#
sub fatal {
    my ($exitval, @args) = @_;

    # firewall, in case we're called with less than two arguments
    if ($#_ < 1) {
        print STDERR "FATAL: fatal() called with only ", $#_ + 1,
          " arguments\n";
        if ($#_ < 0) {
            $exitval = -1;
        }
    }

    # print the error message
    my $line = join(" ", @args);
    print STDERR "$prog: $line\n";

    # unthrottle innd unless -n
    #
    if (!defined $opt{'n'}) {
        system("$ctlinnd go '$reason' >/dev/null 2>&1");
    }

    # exit
    exit $exitval;
}

# check_spool - check the articles found in the spool directory
#
# This subroutine will check all articles found under the $spool directory.
# It will examine only file path that do not contain any "." or whitespace
# character, and whose basename is completely numeric.  Files under
# lost+found will also be ignored.
#
# given:
#       $spooldir  - top of <patharticles> tree
#
sub check_spool {
    my ($spooldir) = @_;    # top of article tree
    my $filename;           # article pathname under $spool
    my $artgrp;             # group of an article
    my $artnum;             # article number in a group
    my $prevgrp = '';       # previous different value of $artgrp
    my $preverrgrp = '';    # previous non-active $artgrp
    my $ARTICLE;            # article handle
    my $aline;              # header line from an article
    my @group;              # array of groups from the Newsgroups header field
    my $FINDFILE;           # find command pipe handle

    # if verbose, say what we are doing
    print "\tfinding articles under $spooldir\n" if defined $opt{'v'};

    # move to the $spool directory
    chdir $spooldir or fatal(2, "cannot chdir to $spool");

    # start finding files
    #
    if (
        !open $FINDFILE, '-|',
        "find . \\( -type f -o -type l \\) -name '[0-9]*' -print 2>&1"
    ) {
        fatal(3, "cannot start find in $spool");
    }

    # process each history line
    #
    while ($filename = <$FINDFILE>) {

        # if the line contains find:, assume it is a find error and print it
        chomp($filename);
        if ($filename =~ /find:\s/o) {
            problem("WARNING:", $filename);
            next;
        }

        # remove the ./ that find put in our path
        $filename =~ s#^\./##o;

        # skip if this path has a . in it (beyond a leading ./)
        next if ($filename =~ /\./o);

        # skip if lost+found
        next if ($filename =~ m:^lost+found/:o);

        # skip if not a numeric basename
        next if ($filename !~ m:/\d+$:o);

        # get the article's newsgroup name (based on its path from $spool)
        $artgrp = $filename;
        $artgrp =~ s#/\d+$##o;
        $artgrp =~ s#/#.#go;

        # if verbose (-v), then note if our group changed
        if (defined $opt{'v'} && $artgrp ne $prevgrp) {
            print "\t$artgrp\n";
            $prevgrp = $artgrp;
        }

        # note if the article is not in a directory that is used by
        # a real (non-aliased) group in the active file
        #
        # If we complained about this group before, don't complain again.
        # If verbose, note files that could be removed.
        #
        if (!defined $gname2type{$artgrp} || $gname2type{$artgrp} =~ /[=jx]/o)
        {
            if ($preverrgrp ne $artgrp) {
                problem("$artgrp: not an active group directory");
                $preverrgrp = $artgrp;
            }
            if (defined $opt{'v'}) {
                problem("$filename: article found in non-active directory");
            }
            next;
        }

        # check on the article number
        $artnum = $filename;
        $artnum =~ s#^.+/##o;
        if ($artnum =~ m/^0/o) {
            problem("$filename: article basename starts with a 0");
        }
        if (defined $gname2type{$artgrp}) {
            if ($lowart{$artgrp} > $highart{$artgrp}) {
                problem("$filename: active indicates group should be empty");
            } elsif ($artnum < $lowart{$artgrp}) {
                problem("$filename: article number is too low");
            } elsif ($artnum > $highart{$artgrp}) {
                problem("$filename: article number is too high");
            }
        }

        # if check filenames only (-c), then do nothing else with the file
        next if (defined $opt{'c'});

        # don't open a control or junk, they can be from anywhere
        next
          if ($artgrp eq "control"
              || $artgrp =~ /^control\./
              || $artgrp eq "junk");

        # try open the file
        if (!open $ARTICLE, '<', $filename) {

            # the find is now gone (expired?), give up on it
            problem("WARNING: cannot open $filename");
            next;
        }

        # read until the Newsgroups header field is found
      AREADLINE:
        while ($aline = <$ARTICLE>) {

            # catch the Newsgroups header field
            if ($aline =~ /^Newsgroups:\w*\W/io) {
                # convert $aline into a comma separated list of groups
                # remove both CR and LF from the header field body (because
                # the article may be stored in wire-format).
                $aline =~ s/^Newsgroups://io;
                $aline =~ tr/ \t\r\n//d;

                # form an array of news groups
                @group = split(",", $aline);

                # see if any groups in the Newsgroups header field
                # are our group
                for (my $j = 0; $j <= $#group; ++$j) {

                    # look at the group
                    if (exists $realgname{ $group[$j] }
                        and $realgname{ $group[$j] } eq $artgrp)
                    {
                        # this article was posted to this group
                        last AREADLINE;
                    }
                }

                # no group or group alias was found
                problem("$filename: does not belong in $artgrp"
                      . " according to its Newsgroups header field");
                last;

                # else watch for the end of the header field
            } elsif ($aline =~ /^\s*$/o) {

                # no Newsgroups header field found
                problem("WARNING: $filename: no Newsgroups header field");
                last;
            }
            if (eof $ARTICLE) {
                problem("WARNING: $filename: EOF found while reading headers");
            }
        }

        # close the article
        close $ARTICLE;
    }

    # all done with the find
    close $FINDFILE;
    return;
}
