#!/usr/bin/perl ################################################################## # Author: Brent Hughes # # Date: 11/8/03 # # Program: rget-links.pl # # Purpose: This is a program to retrieve links from web pages # recursively. This may not seem terribly useful at # first, but by feeding the output to grep one can # easily acquire a list of files matching a certain # pattern. Automate lwp-download over the resulting # list and you'll see my point. This method can easily # be used to aquire thousands of movie files and images # or whatever else you can think of. # # Bugs: Please report any bugs to brent_hughes_@hotmail.com # Also feel free to comment on the program's function # and/or propose additional features. # ################################################################## use warnings; use strict; package RGetLinks; use LWP::UserAgent; use HTML::LinkExtor; use URI::URL; use Getopt::Long; $| = 1; # global data for this program my $depth; my %files; # command line options my $opt_depth = 3; # retrieve command line options my $options = GetOptions ("depth=i" => \$opt_depth); # numeric # acquire url from command line my $url = shift; # abort if the options are improperly formatted if(!defined $url){ usage(); } # program enters actual processing at this point rgetlinks($url,$opt_depth); ################################################################# # Subroutines # A routine to get links recursively sub rgetlinks { my($url,$maxdepth) = @_; chomp($url); # initialize globals $depth = 0; %files = (); # descend rgetlinkshelper($url,$maxdepth); } # A helper routine to get links recursively sub rgetlinkshelper { my($url,$maxdepth) = @_; # return if too deep or already been here if($depth >= $maxdepth || defined $files{$url}) { return; } else { # drop down a level and add the file to the hash $depth++; $files{$url} = 1; # show our current location foreach(1..$depth) {print ' ';} print $url, "\n"; # retrieve all links my @links = getlinks($url); # recursive step foreach(@links){ rgetlinkshelper($_,$maxdepth); } # pop up a level $depth--; } } # A routine to return all links from a URL # This routine was borrowed almost verbatim from an example program. # However, I did optimize it to only retrieve links from text/html # files. The program was trying to retrieve links from large movie # files. That didn't work to well and took up a lot of computation time. my @links = (); sub getlinks { my($url) = @_; # for instance my $ua = new LWP::UserAgent; # Make the parser. Unfortunately, we don't know the base yet # (it might be diffent from $url) @links = (); my $p = HTML::LinkExtor->new(\&callback); # Look at the header to determine what type of document we have my $headreq = HTTP::Request->new(HEAD => $url); my $headres = $ua->request($headreq); my $type = $headres->header('content-type'); # only parse the document for links if it is a text or html document if(defined $type && $type =~ /text|html/) { # Request document and parse it as it arrives my $getreq = HTTP::Request->new(GET => $url); my $getres = $ua->request($getreq, sub{ $p->parse($_[0])}); # Expand all URLs to absolute ones my $base = $getres->base; @links = map { $_ = url($_, $base)->abs; } @links; } # Return the links return @links; } # Set up a callback that collects links sub callback { my($tag, %attr) = @_; return if $tag ne 'a'; # we only look closer at push(@links, values %attr); } # A routine to provide instructions sub usage { # strip the progname with a regex my $progname = $0; $progname =~ s/(.*\\|.*\/)(.*)/$2/g; # show instructions print "\nUsage:\n\t\t", $progname, " [args] target-url > output-file\n\n", "Example:\n\t\t", $progname, " --depth=3 http://www.perl.org\n\n"; print "Options\n", "=======\n", "--depth\t\t", "The maximum depth of links to traverse (default = 3)\n"; exit(); }