#!/usr/bin/perl -w
$CHILDREN = 40; # Number of children to spawn
$TIMEOUT = 30; # DNS timeout
$BUFFER = 60000; # Maximum number of log lines to keep in memory
$FLUSH = 3000; # Flush output buffer every $FLUSH_LINES lines
$STATUS = 0; # Display status message every $STATUS lines
$TTL = 86400 * 7; # Seconds until disk cached ips are expired
$DEBUG = 1;
# ip2host v0.06 - Resolve IPs to hostnames in web server logs
# Maurice Aubrey <maurice@hevanet.com>
#
# Usage: ip2host [OPTIONS] [cache_file] < infile > outfile
#
# $Id: ip2host,v 1.4 2001/01/30 11:16:49 maurice Exp $
#
# CHANGES:
#
# 0.06 Tue Jan 30 02:56:18 PST 2001
# - Added command-line options
# - Place restriction on number of log lines kept
# in memory
# - Add DB_File subclass which understands how to
# expire IPs
# - Cache IPs in memory when read from disk cache
# (substantial speed improvement)
# - No longer using IO::Select module
#
# 0.05 Fri Apr 14 05:31:38 PDT 2000
# - Add POD to allow inclusion in CPAN
#
# 0.04 Mon Nov 22 17:54:07 PST 1999
# - Check socketpair() return value
# - Updated documentation
#
# 0.03 Thu Nov 18 16:57:53 PST 1999
# - Renamed $BUFFER to $FLUSH
# - Improved documentation
#
# 0.02 Sat Oct 16 00:05:29 PDT 1999
# - Initial public release
#
# Benchmark comparison between ip2host and logresolve.pl:
#
# [maurice@foo maurice]$ wc -l access_log
# 200000 access_log
#
# [maurice@foo maurice]$ time ./ip2host --timeout=5 < access_log >/dev/null
# 26.11user 1.41system 0:36.13elapsed 76%CPU (0avgtext+0avgdata 0maxresident)k
# 0inputs+0outputs (280major+5748minor)pagefaults 0swaps
#
# [maurice@foo maurice]$ time ./logresolve.pl < access_log > /dev/null
# 19.01user 1.48system 9:10.98elapsed 3%CPU (0avgtext+0avgdata 0maxresident)k
# 0inputs+0outputs (291major+1379minor)pagefaults 0swaps
use strict;
use vars qw(
$CHILDREN $TIMEOUT $BUFFER $FLUSH $STATUS $TTL $DEBUG $VERSION %Cache %Opt
);
use Socket;
use Symbol ();
use Getopt::Long ();
$VERSION = '0.06';
{
package DB_File::ip2host;
use strict;
use vars qw( @ISA $DEBUG $TTL %Cache );
use Carp;
$DEBUG = 1;
@ISA = qw( DB_File );
# Delay loading of DB_File module until this package is
# actually needed.
sub init {
my %args = @_;
$args{ttl} or croak "no ttl specified";
$TTL = $args{ttl};
require DB_File;
}
# In order to implement EXISTS, we'd need to parse
# the value to see if the ip has expired, which is just
# as expensive as FETCH. So we'll just make sure we
# never use it.
sub EXISTS { croak "exists not implemented!"; }
sub FETCH {
my $self = shift;
my $ip = shift;
return $Cache{ $ip } if exists $Cache{ $ip };
my $val = $self->SUPER::FETCH( $ip );
defined $val or return $Cache{ $ip } = undef;
my($utc, $host) = split /:/, $val, 2;
time - $utc < $TTL or return $Cache{ $ip } = undef;
return $Cache{ $ip } = $host;
}
sub STORE {
my $self = shift;
my($ip, $host) = @_;
return $host if defined $Cache{ $ip };
$self->SUPER::STORE( $ip => (time . ':' . $host) );
$Cache{ $ip } = $host;
}
}
{
my $last_msg = ''; # closure to remember last msg displayed
sub status {
my $msg = shift;
my $gap = length($last_msg) - length($msg);
$last_msg = $msg;
join '', "\r", $msg, ($gap > 0 ? " " x $gap : '');
}
}
sub usage {
my $exit = shift || 0;
print STDERR <<EOF;
$0 version $VERSION
Usage: $0 [OPTIONS] [cache_file] < input_log > output_log
infile Web server log file. Any log format is acceptable,
as long as each line begins with the remote client's
IP address.
outfile Same as input file, but with all of the IPs resolved
to hostnames.
Options:
--help Display this help and exit
--children=... Number of child processes to spawn (default: $CHILDREN)
--timeout=... Seconds to wait on DNS response (default: $TIMEOUT)
--buffer=... Maximum number of log lines to keep in
memory (default: $BUFFER)
--flush=... Number of lines to process before flushing
output buffer (default: $FLUSH)
--status=... Display status message every N lines (default: none)
--cache=... Filename to use as disk cache (default: none)
--ttl=... Number of seconds before IP cached on disk is expired
(default: $TTL)
See the POD for more details:
perldoc $0
Copyright 1999-2001, Maurice Aubrey <maurice\@hevanet.com>.
All rights reserved.
This module is free software; you may redistribute it and/or
modify it under the same terms as Perl itself.
EOF
exit $exit;
}
sub child_resolver {
my($fh, $timeout) = @_;
$SIG{'ALRM'} = sub { die 'alarmed' };
while(defined(my $ip = <$fh>)) { # Get IP to resolve
chomp($ip);
my $host = undef;
eval { # Try to resolve, but give up after $TIMEOUT seconds
alarm( $timeout );
my $ip_struct = inet_aton $ip;
$host = gethostbyaddr $ip_struct, AF_INET;
alarm(0);
};
# XXX Debug
if ($DEBUG and $@ =~ /alarm/) {
$host ||= 'TIMEOUT';
# print STDERR "Alarming ($ip)...\n";
}
$host ||= $ip;
print $fh "$ip $host\n";
}
}
%Opt = (
children => $CHILDREN,
timeout => $TIMEOUT,
buffer => $BUFFER,
flush => $FLUSH,
status => $STATUS,
cache => undef,
ttl => $TTL,
);
Getopt::Long::GetOptions(\%Opt,
"children|kids=i",
"timeout=i",
"buffer=i",
"flush=i",
"status=i",
"ttl=i",
"cache=s",
"usage|help|version",
);
usage(0) if $Opt{usage};
usage(1) if @ARGV > 1;
$Opt{cache} = shift @ARGV if @ARGV;
# Spawn the children
my %kids = (
wtr_vec => '',
rdr_vec => '',
fh => {},
max_fileno => 0,
min_fileno => undef,
);
for(my $child = 1; $child <= $Opt{children}; $child++) {
my($child_fh, $parent_fh) = (Symbol::gensym, Symbol::gensym);
socketpair($child_fh, $parent_fh, AF_UNIX, SOCK_STREAM, PF_UNSPEC)
or die "$0 socketpair failed: $!";
select $child_fh; $| = 1;
select $parent_fh; $| = 1;
select STDOUT;
if (my $pid = fork) {
close $parent_fh;
my $child_fileno = fileno( $child_fh );
$kids{fh}{ $child_fileno } = $child_fh;
vec($kids{wtr_vec}, $child_fileno, 1) = 1; # start out writing to all
$kids{min_fileno} = $child_fileno unless defined $kids{min_fileno};
$kids{max_fileno} = $child_fileno;
} else { # Child starts here
defined $pid or die "$0 fork failed: $!";
close $child_fh; close STDIN; close STDOUT;
child_resolver( $parent_fh, $Opt{timeout} );
exit 0;
}
}
if ($Opt{cache}) { # Cache results to disk if asked
DB_File::ip2host::init( ttl => $Opt{ttl} );
tie %Cache, 'DB_File::ip2host', $Opt{cache}
or die "$0 unable to tie '$Opt{cache}': $!";
}
my $lineno = 0;
my $next_line = 1;
my $is_eof = 0;
my $buffer_used = 0;
my $queries = 0;
my %output;
my %pending;
# Write as many lines as we can until we come across one
# that's missing (that means it's still pending DNS).
sub flush_output {
for (; exists $output{ $next_line }; $next_line++) {
print delete $output{ $next_line };
--$buffer_used;
}
}
sub myselect {
# If buffer is full, only wait on responses
my $wtr_vec = ($buffer_used >= $Opt{buffer} ? undef : $kids{wtr_vec});
my($rout, $wout);
select($rout = $kids{rdr_vec}, $wout = $wtr_vec, undef, undef);
my @readable;
if ($rout) {
for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) {
next unless vec($rout, $i, 1);
# move from read to write set
vec($kids{rdr_vec}, $i, 1) = 0;
vec($kids{wtr_vec}, $i, 1) = 1 unless $is_eof;
push @readable, $kids{fh}{ $i };
}
}
my @writeable;
if ($wout) {
for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) {
next unless vec($wout, $i, 1);
# move from write to read set
vec($kids{wtr_vec}, $i, 1) = 0;
vec($kids{rdr_vec}, $i, 1) = 1;
push @writeable, $kids{fh}{ $i };
}
}
(\@readable, \@writeable);
}
sub show_status {
print STDERR status(join ' ',
"line=$lineno",
"pending=" . scalar keys %pending,
"queries=$queries",
# "scalar keys" is very expensive against a DB file, so we
# call it directly on its in-memory store.
"ip_cache=" . ($Opt{cache} ? scalar keys %DB_File::ip2host::Cache
: scalar keys %Cache),
"buffer=$buffer_used",
);
}
while(1) {
my($readable, $writeable) = myselect;
# One or more children ready for an IP
while (@$writeable and $buffer_used < $Opt{buffer}) {
my $line = <STDIN>;
$is_eof = 1, last unless defined $line;
++$lineno;
my($ip, $rest) = split / /, $line, 2;
if (my $cached_ip = $Cache{ $ip }) { # We found this answer already
$output{ $lineno } = "$cached_ip $rest";
} elsif (exists $pending{ $ip }) { # We're still looking
push @{ $pending{ $ip } }, [ $lineno, $rest ];
} else { # Send IP to child
my $write_fh = shift @$writeable;
print $write_fh "$ip\n";
$pending{ $ip } = [ [ $lineno, $rest ] ];
$queries++;
}
$buffer_used++;
flush_output if exists $output{ $next_line } and
($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0);
show_status if $Opt{status} and $lineno % $Opt{status} == 0;
}
while (@$readable) { # One or more children have an answer
my $read_fh = shift @$readable;
my $str = <$read_fh>;
defined $str or next;
chomp($str);
my($ip, $host) = split / /, $str, 2;
$Cache{ $ip } = $host;
# Take all the lines that were pending for this IP and
# toss them into the output buffer
foreach my $pending (@{ $pending{ $ip } }) {
$output{ $pending->[0] } = "$host $pending->[1]";
}
delete $pending{ $ip };
flush_output if exists $output{ $next_line } and
($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0);
show_status if $is_eof and $Opt{status};
}
last if $is_eof and not keys %pending;
}
flush_output;
print STDERR "\n" if $Opt{status};
=pod
=head1 NAME
ip2host - Resolve IPs to hostnames in web server logs
=head1 SYNOPSIS
ip2host [OPTIONS] [cache_file] < infile > outfile
infile - Web server log file. Any log format is acceptable,
as long as each line begins with the remote client's
IP address.
outfile - Same as input file, but with all of the IPs resolved
to hostnames.
=head1 DESCRIPTION
This script is a drop-in replacement for the logresolve.pl
script distributed with the Apache web server.
ip2host has the same basic design of forking children to handle
the DNS resolution in parallel, but multiplexes the communication
to minimize the impact of slow responses. This results in a
significant speed improvement (approximately 10x faster), and
the performance degrades more gracefully as the DNS timeout value
is increased.
For a description of the command-line options type:
./ip2host --help
This script is reported to work under Linux, FreeBSD, Solaris,
Tru64, and IRIX.
=head1 AUTHOR
Maurice Aubrey E<lt>maurice@hevanet.comE<gt>
=head1 COPYRIGHT
Copyright 1999-2001, Maurice Aubrey E<lt>maurice@hevanet.comE<gt>.
All rights reserved.
This module is free software; you may redistribute it and/or
modify it under the same terms as Perl itself.
=head1 README
Drop-in replacement for the logresolve.pl script distributed
with the Apache web server that's approximately 10x faster.
=head1 SCRIPT CATEGORIES
Web
=cut