#!/usr/local/bin/perl -w =head1 NAME timefetch.pl - Fetch a web page and time how long it takes =head1 SYNOPSIS C C =head1 README This simple script uses LWP and Time::HiRes to fetch a web page, parse embedded IMG tags and other tags, download the images, and time the whole operation. Depending on the command line options you use, it can just download a simgle HTML page or recursively download all embedded images and applets, and tally a grand total download time and total download size. This script always downloads embedded frames in framesets. This script does not implement a smart browser. It basically does two kinds of benchmarks: best-case scenario and absolute worst-case scenario. The best-case scenario is when a browser downloads just the contents of a single HTML page, and does not download any images, because they are in the cache. The worst-case scenario is when the browser must download each and every image, one after the other. Neither one of these scenarios exactly matches real-live scenarios you experience with a browser. However, these numbers are an objective measurement of a repeatible and well-defined process. You can use them for straightforward comparison of the download times of two different web pages, or chart the download time of a single page during different kinds of loads and traffic patterns. One extra feature in this script is the "force-host" feature. You can force this script to munge each URL before fetching it. This is useful in a situation where the URLs in your HTML pages all contain a load-balanced hostname, but you want to target a specific member host, or compare two member hosts. This script is NOT a spider. It will only fetch the inline images and applets contained in a single web page. However, it will download component frames in a frameset down to arbitrary levels. =head1 EXAMPLES C C C =head1 PREREQUISITES This script requires C and C. As written, it requires Perl 5.004, but in a pinch you could modify it to run on earlier versions of Perl 5. =head1 COPYRIGHT Copyright (c) 1998 John Nolan. All rights reserved. This program is free software. You may modify and/or distribute it under the same terms as Perl itself. This copyright notice must remain attached to the file. =head1 REVISION $Id: timefetch.pl,v 1.6 1998/11/05 02:53:30 john Exp $ =pod SCRIPT CATEGORIES Web/Utilities =cut # --------------------------------------------------------------------------- my $VERSION = 0.20; $|++; # Turn on autoflush use 5.004; # We will use "for my" # --------------------------------------------------------------------------- # Load modules # use Time::HiRes; use HTTP::Request; use LWP::UserAgent; use HTML::LinkExtor; use URI::URL; use Getopt::Std; use strict; # --------------------------------------------------------------------------- # Define global variables and objects up front # my ($Ua, $Pa, @Urls, $TotalTime, $TotalSize, $RealUrl); my ($Debug, $Recursively, $Verbose, $Java, $ForceHost); # This variable will remember whether we had to recurse # in order to download frames. It will contain a list of # the *types* of files which we downloaded additionally # (e.g., img, applet). # my $DidRecurse; # These will hold lists of urls, along with time and size data. # my %Cache; my @SummaryCache; # --------------------------------------------------------------------------- # Subroutines # --------------------------------------------------------------------------- # --------------------------------------------------------------------------- # sub Usage { return "Usage: $0 [dhjrv] [-f host] http://url [http://url ... ]\n"; } # --------------------------------------------------------------------------- # sub PrintHelpMsg { print Usage(); print <{TIME}, $Cache{$url}->{SIZE}/1000, $url); } # --------------------------------------------------------------------------- # Fetch a URL and time how long it takes # sub FetchUrl { my $url = (shift or die "Internal routine FetchUrl expects param!\n"); print "Fetching: $url\n" if $Debug; my $req = HTTP::Request->new (GET => $url); # Temporarily turn off warnings. The bowels of LWP are returning # an obscure error here, which I don't know how to troubleshoot. # $^W = 0; my $start = Time::HiRes::gettimeofday(); my $res = $Ua->request ($req); my $finish = Time::HiRes::gettimeofday(); $^W = 1; # Turn warnings back on again. # Calculate stats my $size = length $res->content; my $time = ($finish - $start); return ($time,$size,$res); } # --------------------------------------------------------------------------- # HandleParsedLink # # This is a callback provided for handling HTML links found during parsing. # It is called once for each link found, with the $tag and %link # passed as parameters. $tag is the HTML tag where the link was found. # %links is a hash that contains the keyword/value pairs from the links # that contain URLs. # # For example, if an HTML anchor was found, the $tag would be "a" # and %links would be (href=>"url"). We check each URL in %links. # sub HandleParsedLink { my ($tag, %links) = @_; my ($time, $size, $res, $urlobj, $url); # Flag for deciding whether we want to download a given URL # my $we_want_this_url = 0; print "\$tag = $tag\n" if $Debug; $we_want_this_url = 1 if $Recursively and $tag =~ /img/; $we_want_this_url = 1 if $Java and $tag =~ /applet/; $we_want_this_url = 1 if $tag =~ /frame/; return unless $we_want_this_url; # Examine the tag, and fetch its target if necessary. # for my $key (keys %links) { print "$tag: $key: $links{$key}\n" if $Debug; # Get the absolute URL # $urlobj = URI::URL->new( $links{$key}, $RealUrl ); $url = $urlobj->abs; # Force a particular host by munging the actual URL # $url =~ s#http://[^/]+#http://$ForceHost# if $ForceHost; # Remove any in-page anchor tags. # $url =~ s/^([^#]*)#.*$/$1/; print "Examining url $url\n" if $Debug; # Process each URL that we have not seen before # unless (exists $Cache{$url}) { ($time,$size,$res) = FetchUrl($url); $TotalSize += $size; $TotalTime += $time; $Cache{$url} = { TIME => $time, SIZE => $size }; PrintUrl($url) if ($Verbose); # Keep a list each *type* of item # that we have recursively downloaded # $DidRecurse .= " $tag" unless $DidRecurse =~ /$tag/; # If this was a frame tag, then recurse into it. # $Pa->parse($res->content) if $tag =~ /frame/; } } 1; # Return a nice happy true value. } # --------------------------------------------------------------------------- # Process each URL in the list. # sub ExamineUrls { my @urls = @_; for my $url (@urls) { my ($size,$time,$res); $TotalTime = 0; $TotalSize = 0; $RealUrl = ""; $DidRecurse = ""; # Force a particular host by munging the actual URL # if ($ForceHost) { $url =~ s#http://[^/]+#http://$ForceHost#; } print "Examining url $url\n" if $Debug; # We make two fetches, no matter what. # First fetch. This fetch will traverse redirects, so it is added # to the total stats for the main url. We only care about the total # if we are doing a recursive download. As a side effect, after traversing # redirects, we learn the true URL of the page ($RealUrl). # Fetch the URL! # ($time,$size,$res) = FetchUrl($url); if ($Recursively) { $TotalSize += $size; $TotalTime += $time; } # Now remember the URL we were redirected to. # NOTE this call to base() must come before we invoke # the parse function. Otherwise base() may croak. # $RealUrl = $res->base(); # Analyze the HTML we got back, and extract links. # The handler HandleParsedLink will download each # linked image, if we are doing a recursive download. # It will also download framesets, and parse them recursively. # $Pa->parse($res->content); # Force a particular host by munging the actual URL # $RealUrl =~ s#http://[^/]+#http://$ForceHost# if $ForceHost; print "Examining RealUrl $RealUrl\n" if ($Debug); # Second fetch. This fetch does not involve any redirects, # because we are hitting the real URL directly. The data # is not tallied into the total for the main URL. But we get # the data anyway, so we have it for just that URL alone. # We will need to report these values in every case. unless (exists $Cache{ $RealUrl }) { ($time,$size) = FetchUrl( $RealUrl ); # Tally and store stats # $TotalSize += $size; $TotalTime += $time; $Cache{ $RealUrl } = { TIME => $time, SIZE => $size }; } push @SummaryCache, $RealUrl; if ($Recursively or $DidRecurse) { # Store the whole recursive fetch with a special URL string. # my $recursiveUrl = "$url (incl.:$DidRecurse)"; $Cache{$recursiveUrl} = { TIME => $TotalTime, SIZE => $TotalSize }; push @SummaryCache, $recursiveUrl; } } } # --------------------------------------------------------------------------- # sub PrintFinalResults { my $Separator = "----------------------------------------------------\n"; print $Separator if ($Verbose or $Debug); for my $url (@SummaryCache) { PrintUrl($url); } } # --------------------------------------------------------------------------- # MAIN LOGIC # --------------------------------------------------------------------------- @Urls = GetParams(); # Set up a browser # $Ua = LWP::UserAgent->new; $Ua->agent("Mozilla (compatible: LWP $LWP::VERSION)"); $Ua->timeout(30); # Set up a parser # $Pa = HTML::LinkExtor->new (\&HandleParsedLink); ExamineUrls( @Urls ); PrintFinalResults(); # --------------------------------------------------------------------------- # END # --------------------------------------------------------------------------- 1;