#!/usr/bin/perl
#
# weblogin-passcheck -- Check Apache access logs for disclosed passwords.
#
# Takes as input Apache access logs, possibly preceded by the filename and a
# colon.  Parse each line to see if it contains a login via GET, and if so,
# store the username, user agent, and date.

##############################################################################
# Modules and globals
##############################################################################

require 5.006;
use strict;
use warnings;

use Date::Parse qw(str2time);
use Getopt::Long qw(GetOptions);
use POSIX qw(strftime);

# Grouping threshold for incidents.  Above this count of authentications, the
# number of affected users will be grouped together.
my $COUNT_THRESHOLD = 5;

# Components of an Apache access log.
my $VHOST_REGEX  = qr{ (?: [\w._-]+ : )? }xms;
my $CLIENT_REGEX = qr{ ( [\d.]+ )        }xms;
my $AUTH_REGEX   = qr{ \S+ [ ] \S+       }xms;
my $TIME_REGEX   = qr{ \[ ([^\]]+) \]    }xms;
my $STRING_REGEX = qr{ \" ([^\"]+) \"    }xms;
my $STATUS_REGEX = qr{ (\d+)             }xms;

# Regex to match a single line of Apache access log output.
my $LOG_REGEX = qr{
    ^
    $VHOST_REGEX                # optional virtual host
    $CLIENT_REGEX \s            # client IP address (1)
    $AUTH_REGEX \s              # authentication information
    $TIME_REGEX \s              # timestamp (2)
    $STRING_REGEX \s            # query (3)
    $STATUS_REGEX \s            # HTTP status (4)
    \S+                         # size
    (?:                         # look for user agent (optional)
      [ ]
      $STRING_REGEX \s          # referer (5)
      $STRING_REGEX             # user agent (6)
    )?
    \s* \z
}xms;

# Regex to match a single query that we care about.
my $QUERY_REGEX = qr{
    ^
    GET \s+                     # Only match GET; POST didn't have a problem
    /login(?:-spnego)?          # Queries to the login form
    /*
    ( [^/\s] \S* )              # Query parameters (1)
}xms;

##############################################################################
# Utility functions
##############################################################################

# Decode a single CGI parameter.  Helper function for decode_params.
#
# $param - Encoded CGI parameter
#
# Returns: key and value as a list
sub decode_param {
    my ($param) = @_;
    my ($key, $value) = split(m{=}xms, $param, 2);
    $value =~ s{ % (..) }{ chr(hex($1)) }xmsge;
    return ($key, $value);
}

# Parse the CGI parameters from a URL into a hash.
#
# $params - CGI parameters from a URL as a string
#
# Returns a reference to a hash of parameter keys and values.
sub decode_params {
    my ($params) = @_;
    my @params = split(m{ [;&] }xms, $params);
    my %params = map { decode_param($_) } @params;
    return \%params;
}

# Determine if a Kerberos password is valid for a user.
#
# $user     - Username (Kerberos principal)
# $password - Password to attempt
# $realm    - Kerberos realm, used to qualify $user if needed
#
# Returns true if valid, false otherwise.  Throws string exceptions on
# internal Kerberos errors.
sub password_validate {
    my ($user, $password, $realm) = @_;

    # Fully-qualify the principal as needed and parse it.
    if ($user !~ m{ \@ }xms) {
        $user .= q{@} . $realm;
    }
    my $client = Authen::Krb5::parse_name($user)
      or die "Internal error parsing Kerberos principal\n";

    # Attempt the authentication.
    my $creds = Authen::Krb5::get_init_creds_password($client, $password);
    return defined $creds;
}

##############################################################################
# Reporting
##############################################################################

# Add an authentication count to a table that records number of users found by
# number of authentications, respecting $COUNT_THRESHOLD.  In other words,
# each key of the hash is the number of times that user's password was seen in
# the logs, and the value is the number of users seen with that frequency.
#
# $count      - Number of authentications
# $counts_ref - Hash reference storing frequency buckets
#
# Returns undef.
sub store_count {
    my ($count, $counts_ref) = @_;
    my $key = $count;
    if ($count > $COUNT_THRESHOLD) {
        $key = q{>} . $COUNT_THRESHOLD;
    }
    $counts_ref->{$key}++;
    return;
}

# Analyze log parsing results and print out reports.
#
# $auth_ref   - Reference to hash of authentication data w/keys:
#   date     - Date of authentication
#   agent    - User agent from Apache logs
#   host     - Client host
#   password - User's password
# $realm      - If present, Kerberos realm for checking passwords
#
# Returns undef.
sub analyze_auth_data {
    my ($auth_ref, $realm) = @_;
    my (%count_of, %valid_count_of);
    my $total  = 0;
    my $atrisk = 0;

    # Walk through the data hash and total up the results.
  USER:
    for my $user (sort keys %{$auth_ref}) {
        my $date = strftime('%Y-%m-%d %T', localtime $auth_ref->{$user}{date});
        my $count    = $auth_ref->{$user}{count};
        my $password = $auth_ref->{$user}{password};

        # If a Kerberos realm was given, check whether this password is valid.
        my $valid;
        if ($realm) {
            $valid = password_valid($user, $password, $realm);
        }

        # Print out the record of this user.
        printf "%-8s  %4d time%s  Last: %s %s (from %s)\n", $user, $count,
          ($count == 1 ? q{ } : q{s}), ($valid ? 'GP' : q{  }), $date,
          $auth_ref->{$user}{host}
          or die "Cannot write to STDOUT: $!\n";

        # Increment our counts.
        $total++;
        store_count($count, \%count_of);
        if ($valid) {
            $atrisk++;
            store_count($count, \%valid_count_of);
        }
    }

    # Report the total number of users.
    printf "\n%4d total users\n", $total
      or die "Cannot write to STDOUT: $!\n";
    if ($realm) {
        printf "%4d users with still-valid passwords\n", $atrisk
          or die "Cannot write to STDOUT: $!\n";
    }

    # Give a breakdown of users by the number of incidents.
    print "\nBreakdown of all users:\n"
      or die "Cannot write to STDOUT: $!\n";
    for my $count (sort keys %count_of) {
        printf "%2s time%s  %4d users (%4.1f%%)\n", $count,
          ($count eq '1' ? q{ } : q{s}), $count_of{$count},
          ($count_of{$count} / $total * 100)
          or die "Cannot write to STDOUT: $!\n";
    }

    # If we were checking passwords, also break down the users with
    # still-valid passwords based on the number of incidents.
    if ($realm) {
        print "\nBreakdown of users with still-good passwords:\n"
          or die "Cannot write to STDOUT: $!\n";
        for my $count (sort keys %valid_count_of) {
            printf "%2s time%s  %4d users (%4.1f%%)\n", $count,
              ($count eq '1' ? q{ } : q{s}), $valid_count_of{$count},
              ($valid_count_of{$count} / $total * 100)
              or die "Cannot write to STDOUT: $!\n";
        }
    }
    return;
}

##############################################################################
# Main routine
##############################################################################

# Parse command-line options.
my ($help, $realm, $users_file);
Getopt::Long::config('bundling');
GetOptions(
    'h|help'    => \$help,
    'r|realm=s' => \$realm,
    'u|users=s' => \$users_file
) or exit 1;
if ($help) {
    print "Feeding myself to perldoc, please wait....\n"
      or die "Cannot write to STDOUT: $!\n";
    exec('perldoc', '-t', $0);
}

# Parse the password file to get a list of valid users.
my %valid_users;
if ($users_file) {
    open(my $users_fh, '<', $users_file)
      or die "Cannot open $users_file: $!\n";
    local $_ = q{};
    while (<$users_fh>) {
        my ($user) = split m{:}xms;
        $valid_users{$user} = 1;
    }
    close($users_fh) or die "Cannot close $users_file: $!\n";
}

# Initialize Kerberos if a realm was specified.
if ($realm) {
    require Authen::Krb5;
    Authen::Krb5::init_context();
}

# Parse the Apache log and accumulate user information in %users and user
# agent information (not currently used) in %agents.
my (%auth_for, %agents);
ACCESS:
while (defined(my $line = <>)) {
    $line =~ s{ ^ [\w./-]+ : }{}xms;

    # See if this is a long line that we care about.
    my ($host, $date, $query, $status, $refer, $agent) = $line =~ $LOG_REGEX;
    next ACCESS if !defined $host;

    # See if this is a GET of the login form.
    my ($params) = $query =~ $QUERY_REGEX;
    next ACCESS if !defined $params;

    # Check whether username, password, and login are set in the form.
    my $params_ref = decode_params($params);
    next ACCESS if (!$params_ref->{username} || !$params_ref->{password});
    next ACCESS if !$params_ref->{login};

    # This is a problematic login.  But skip if not in the user list.
    my $user = $params_ref->{username};
    next ACCESS if (%valid_users && !$valid_users{$user});

    # Convert the date to seconds since epoch.
    $date =~ s{ : }{ }xms;
    $date = str2time($date);

    # Remember the user agent.
    if ($agent) {
        $agents{$agent}++;
    }

    # Remember the user.
    $auth_for{$user}{count}++;

    # If this is a new user or later login, remember the information.
    if (!$auth_for{$user}{date} || $auth_for{$user}{date} < $date) {
        $auth_for{$user}{date}     = $date;
        $auth_for{$user}{agent}    = $agent;
        $auth_for{$user}{host}     = $host;
        $auth_for{$user}{password} = $params_ref->{password};
    }
}

# We now have all the data.  Report on the users.
analyze_auth_data(\%auth_for, $realm);

__END__

##############################################################################
# Documentation
##############################################################################

=for stopwords
Allbery weblogin-passcheck WebLogin TimeDate CPAN WebAuth login logins
usernames Kerberos username sublicense MERCHANTABILITY NONINFRINGEMENT

=head1 NAME

weblogin-passcheck - Check Apache access logs for disclosed WebLogin passwords

=head1 SYNOPSIS

B<weblogin-passcheck> [B<-h>] [B<-r> I<realm>] [B<-u> I<users>] I<log>
    [I<log> ...]

=head1 REQUIREMENTS

Perl 5.6 or later and the Date::Parse Perl module, which is part of the
TimeDate distribution on CPAN.  To check whether the user's password is
still valid, the Authen::Krb5 module, also available from CPAN, is
required.

=head1 DESCRIPTION

Versions of the WebLogin script included in WebAuth releases 3.5.5 through
3.6.1 could potentially convert the user login via POST to a GET, thus
exposing the user's password in the URL and possibly to other web servers
via referrer.  This script scans Apache access logs (in the combined,
host_combined, or common log formats) for WebAuth logins via GET and
produces a report of affected users.

Optionally, B<weblogin-passcheck> can also filter out invalid usernames
(usually due to brute-force intrusion attacks) given a list of users via
the B<-u> option.  Also optionally, B<weblogin-passcheck> can check
whether the password logged is still valid if given a default Kerberos
realm to construct Kerberos principals via the B<-r> option.  This option
requires the Authen::Krb5 Perl module.

=head1 OPTIONS

=over 4

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script
to C<perldoc -t>).

=item B<-r> I<realm>, B<--realm>=I<realm>

Check any detected passwords to see if they are still valid.  Entries for
users with a password that's still valid will have C<GP> in the report.
The I<realm> is used to create the Kerberos principal from the username if
the username does not contain an C<@>.

=item B<-u> I<users>, B<--users>=I<users>

If this option is given, the I<users> argument is the path to a file
containing a list of valid users and any user not listed in that file will
be filtered out of the report.  The file may either be one username per
line or in the UNIX password file format.

This option assumes that usernames in the WebLogin logs will be
unqualified.  If the WebLogin server is used to authenticate users from
multiple realms, the full principals as occur in the username form
parameter on the WebLogin server must be listed in this file.

=back

=head1 CAVEATS

This script assumes the recommended URL for the WebLogin login script and
requires that the logs be in one of the standard Apache log formats.

=head1 SEE ALSO

This script is is part of the WebAuth distribution, the current version of
which can be found at L<http://webauth.stanford.edu/>.

=head1 AUTHOR

Russ Allbery <eagle@eyrie.org>

=head1 COPYRIGHT AND LICENSE

Copyright 2009, 2012, 2013 The Board of Trustees of the Leland Stanford
Junior University

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

=cut
