#!/usr/bin/perl -w # # sitescooper - download news from web sites and convert it automatically # into one of several formats suitable for viewing on a Palm # handheld. # # Skip down to read the POD documentation. # # To set up, search for "CUSTOMISE" -- note UK/Irish spelling. ;) # Change the setting appropriately and uncomment it, if required. # Then move the required sites from the "sites_off" directory into the # "sites" directory, and those will be downloaded automatically next # time you run the script. $main::VERSION = "1.8"; $CONFIG = ' # NOTE: on Windows, you will need to use 2 backslashes in any paths in # this built-in configuration file, to avoid Perl interpreting them, # like so: C:\\TMP ####################################################################### # OPTIONAL SITE-DEPENDENT CONFIGURATION # NOTE: If you will be converting sites into Pilot-readable format, you # may need to specify this! The directory under your PalmPilot Desktop # dir where installable PRC files need to go. # # On UNIX platforms using pilot-xfer, you should set up a directory where # installed files are to go before you run "pilot-xfer -i". # # On a Win32 machine with only one Pilot user, this is determined # automatically from the registry, so you will not need to set it. # Otherwise, on Win32 platforms this is generally of the format # {pilotdir}/{username}/Install, where {pilotdir} is the PalmPilot Desktop # directory, and {username} is the abbreviation of the Pilot user name. # # MacOS users: you do need to customise this, a default will be used # (although nothing will be written there). You need to run the conversion # command yourself afterwards :( # PilotInstallDir: $HOME/pilot/install # CUSTOMISE ####################################################################### # Sites directory, where the site description files are stored. # By default, a directory called "sites" under your current directory, # or under your $HOME on UNIX, is used if it exists. # SitesDir: $HOME/lib/sites # CUSTOMISE ####################################################################### # Temporary directory to use for sitescooper. A subdirectory will be # created called sitescooper{uid} where {uid} is your user id (or # 0 on Win32 platforms). On UNIX platforms, this defaults to a hidden # directory under your home dir, for privacy. # TmpDir: /tmp # CUSTOMISE ####################################################################### # Specify the HTTP proxy server in use at your site, if applicable. # ProxyHost: proxy.clubi.ie # CUSTOMISE # ProxyPort: 80 # CUSTOMISE ####################################################################### # Diff will be searched for on the path if this is not specified here. # Diff: C:\\path\\to\\diff.exe # CUSTOMISE ####################################################################### # The MakeDoc tool will be searched for on the path if it is # not specified here. Default values are "makedoc" on UNIX platforms, # or "makedocw.exe" on Windows. # MakeDoc: makedocw.exe # CUSTOMISE # The iSilo conversion tool will be searched for on the path if it is # not specified here. Default values are "iSilo386" for UNIX platforms, # or "iSiloW32.exe" on Windows. # iSilo: iSiloW32.exe # CUSTOMISE ####################################################################### # Where you want the text-format output to be saved. If commented, # it will be saved under the sitescooper temporary directory. # Note: MakeDocW on Windows does not like reading text from a # directory with spaces in it, such as a directory under Program Files. # In this case changing this parameter may help. # TextSaveDir: C:\\where\\I\\want\\News # CUSTOMISE ####################################################################### # If you want to share a cache between multiple users or multiple # configurations of sitescooper, uncomment this. It will allow a # shared cache to be used. # SharedCacheDir: /home/jm/lib/scoop_cache # CUSTOMISE ####################################################################### # How long should cached files be kept before expiring? Specified in # days, and fractions are OK. # ExpireCacheAfter: 7.0 # CUSTOMISE '; #--------------------------------------------------------------------------- =head1 NAME sitescooper - download news from web sites and convert it automatically into one of several formats suitable for viewing on a Palm handheld. =head1 SYNOPSIS sitescooper [options] [-site sitename] sitescooper [options] [-levels n] [-storyurl regexp] url [...] Options: [-debug] [-refresh] [-config file] [-limit numkbytes] [-install dir] [-dump] [-dumpprc] [-nowrite] [-nodates] [-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file] [-keep-tmps] [-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader | -pipe fmt command] =head1 DESCRIPTION This script, in conjunction with its configuration file and its set of B files, will download news stories from several top news sites into text format and/or onto your Palm handheld (with the aid of the B/B or B utilities). Alternatively URLs can be supplied on the command line, in which case those URLs will be downloaded and converted using a reasonable set of default settings. HTTP and local files, using the C protocol, are both supported. Multiple types of sites are supported: =over 4 1-level sites, where the text to be converted is all present on one page (such as Slashdot, Linux Weekly News, BluesNews, NTKnow, Ars Technica); 2-level sites, where the text to be converted is linked to from a Table of Contents page (such as Wired News, BBC News, and I, Cringely); 3-level sites, where the text to be converted is linked to from a Table of Contents page, which in turned is linked to from a list of issues page (such as PalmPower). =back In addition sites that post news as items on one big page, such as Slashdot, Ars Technica, and BluesNews, are supported using diff. Note that at this moment in time, the URLs-on-the-command-line invocation format does not support 2- or 3-level sites. The script is portable to most UNIX variants that support perl, as well as the Win32 platform (tested with ActivePerl 5.00502 build 509). Currently the configuration is stored as a string inside the script itself, but an alternative configuration file can be specified with the B<-config> switch. The sites downloaded will be the ones listed in the site files you keep in your F directory. sitescooper maintains a cache in its temporary directory; files are kept in this cache for a week at most. Ditto for the text output directory (set with B in the built-in configuration). If a password is required for the site, and the current sitescooper session is interactive, the user will be prompted for the username and password. This authentication token will be saved for later use. This way a site that requires login can be set up as a .site -- just log in once, and your password is saved for future non-interactive runs. Note however that the encryption used to hide the password in the sitescooper configuration is pretty transparent; I recommend that rather than using your own username and password to log in to passworded sites, a dedicated, sitescooper account is used instead. =head1 OPTIONS =over 4 =item -refresh Refresh all links -- ignore the F file, do not diff pages, and always fetch links, even if they are available in the cache. =item -config file Read the configuration from B instead of using the built-in one. =item -limit numkbytes Set the limit for output file size to B kilobytes, instead of the default 200K. =item -install dir The directory to save PRC files to once they've been converted, in order to have them installed to your Palm handheld. =item -site sitename Limit the run to the site named in the B argument. Normally all available sites will be downloaded. To limit the run to 2 or more sites, provide multiple B<-site> arguments like so: -site ntk.site -site tbtf.site =item -levels n When specifying a URL on the command-line, this indicates how many levels a site has. Not needed when using .site files. =item -storyurl regexp When specifying a URL on the command-line, this indicates the regular expression which links to stories should conform to. Not needed when using .site files. =item -doc Convert the page(s) downloaded into DOC format, with all the articles listed in full, one after the other. =item -text Convert the page(s) downloaded into plain text format, with all the articles listed in full, one after the other. =item -html Convert the page(s) downloaded into HTML format, on one big page, with a table of contents (taken from the site if possible), followed by all the articles one after another. =item -mhtml Convert the page(s) downloaded into HTML format, but retain the multiple-page format. This will create the output in a directory called B; in conjunction with the B<-dump> argument, it will output the path of this directory on standard output before exiting. =item -isilo Convert the page(s) downloaded into iSilo format (see http://www.isilo.com/ ), on one big page. This is the default. The page(s) will be displayed with a table of contents (taken from the site if possible), followed by all the articles one after another. =item -misilo Convert the page(s) downloaded into iSilo format (see http://www.isilo.com/ ), with one iSilo document per site, with each story on a separate page. The iSilo document will have a table-of-contents page, taken from the site if possible, with each article on a separate page. =item -richreader Convert the page(s) downloaded into RichReader format using HTML2Doc.exe (see http://users.erols.com/arenakm/palm/RichReader.html ). The page(s) will be displayed with a table of contents (taken from the site if possible), followed by all the articles one after another. =item -pipe fmt command Convert the page(s) downloaded into an arbitrary format, using the command provided. Sitescooper will still rewrite the page(s) according to the B argument, which should be one of: =over 4 =item text Plain text format. =item html HTML in one big page. =item mhtml HTML in multiple pages. =back The command argument can contain C<__SCOOPFILE__>, which will be replaced with the filename of the file containing the rewritten pages in the above format, C<__SYNCFILE__>, which will be replaced with a suitable filename in the Palm synchronization folder, and C<__TITLE__>, which will be replaced by the title of the file (generally a string containing the date and site name). Note that for the B<-mhtml> switch, C<__SCOOPFILE__> will be replaced with the name of the file containing the table-of-contents page. It's up to the conversion utility to follow the href links to the other files in that directory. =item -dump Output the page(s) downloaded directly to stdout in text or HTML format, instead of writing them to files and converting each one. This option implies B<-text>; to dump HTML, use B<-dump -html>. =item -dumpprc Output the page(s) downloaded directly to stdout, in converted format as a PRC file, suitable for installation to a Palm handheld. =item -nodates Do not put the date in the installable file's filename. This allows you to automatically overwrite old files with new ones when you HotSync. =item -nowrite Test mode -- do not write to the cache or already_seen file, instead write what would be written normally to a directory called new_cache and a new_already_seen file. This is very handy when writing a new site file. =item -debug Enable debugging output. This output is in addition to the usual progress messages. =item -quiet Process sites quietly, without printing the usual progress messages to STDERR. Warnings about incorrect site files and system errors will still be output, however. =item -admin cmd Perform an administrative command. This is intended to ease the task of writing scripts which use sitescooper output. The following admin commands are available: =over 4 =item dump-sites List the sites which would be scooped on a scooping run, and their URLs. Instead of scooping any sites, sitescooper will exit after performing this task. The format is one site per line, with the site file name first, a tab, the site's URL, a tab, the site name, a tab, and the output filename that would be generated without path or extension. For example: S =item journal Write a journal with dumps of the documents as they pass through the formatting and stripping steps of the scooping process. This is written to a file called B in the sitescooper temporary directory. =back =item -nolinkrewrite Do not rewrite links on scooped documents -- leave them exactly as they are. =item -stdout-to file Redirect the output of sitescooper into the named file. This is needed on Windows NT and 95, where perl does not seem to support the > operator. =item -keep-tmps Keep temporary files after conversion. Normally the .txt or .html rendition of a site is deleted after conversion; this option keeps it around. =back =head1 INSTALLATION To install, edit the script and change the #! line. You may also need to (a) change the Pilot install dir if you plan to use the pilot installation functionality, and (b) edit the other parameters marked with CUSTOMISE in case they need to be customised for your site. They should be set to acceptable defaults (unless I forgot to comment out the proxy server lines I use ;). =head1 EXAMPLES sitescooper.pl http://www.ntk.net/ To snarf the ever-cutting NTKnow newsletter ("nasty, British and short"). =head1 ENVIRONMENT B makes use of the C<$http_proxy> environment variable, if it is set. =head1 AUTHOR Justin Mason Ejustin_mason /at/ bigfoot.comE =head1 COPYRIGHT Some of the post-processing and HTML cleanup code include ideas and code shamelessly stolen from http://pilot.screwdriver.net/ , Christopher Heschong's webpage-to-pilot conversion tool, which I discovered after writing a fully-working version of this script! Looks like I reinvented the wheel again on this one ;) Eh, anyway, the remainder of the code is copyright Justin Mason 1998-1999, and is free software and as such is redistributable and modifiable under the same terms as Perl itself. Justin can be reached at . =head1 SCRIPT CATEGORIES The CPAN script category for this script is C. See http://www.cpan.org/scripts/ . =head1 PREREQUISITES C C C C C C All these can be picked up from CPAN at http://www.cpan.org/ . =head1 COREQUISITES C, if running on a Win32 platform, to find the Pilot Desktop software's installation directory. =head1 README Sitescooper downloads news stories from the web and converts them to Palm handheld iSilo, DOC or text format for later reading on-the-move. Site files and full documentation can be found at http://sitescooper.tsx.org/ . =cut #--------------------------------------------------------------------------- sub usage { die <<__ENDOFUSAGE; Sitescooper - downloads news stories from the web and converts them to Palm handheld iSilo, DOC or text format for later reading on-the-move. sitescooper [options] [-site sitename] sitescooper [options] [-levels n] [-storyurl regexp] url [...] Options: [-debug] [-refresh] [-config file] [-limit numkbytes] [-install dir] [-dump] [-dumpprc] [-nowrite] [-nodates] [-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file] [-keep-tmps] [-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader | -pipe fmt command] Both file:/// and http:/// URLs are supported. Version: $main::VERSION __ENDOFUSAGE } #--------------------------------------------------------------------------- # use Carp; # use strict; # ah shaggit, life's too short for strict use File::Find; use File::Path; use LWP::UserAgent; use URI::URL; use HTTP::Date; use HTTP::Request::Common; #CGI use CGI; if (&Portability::MyOS eq 'Win32') { eval 'use Win32::TieRegistry( Delimiter=>"/", ArrayValues=>0 );'; } $SIG{__WARN__} = 'warn_log'; $SIG{__DIE__} = 'die_log'; $OUT_TEXT = 0; $OUT_DOC = 1; $OUT_HTML = 2; $main::home_url = "http://sitescooper.tsx.org"; $main::refresh = 0; #$main::just_caching = 0; $main::cached_front_page_lifetime = 60; # in minutes $main::dump = 0; $main::dumpprc = 0; $main::debug = 0; $main::debugdiffs = 0; # set to 1 to break after diffing $main::verbose = 1; $main::nowrite = 0; $main::nodates = 0; $main::bookmark_char = "\x8D"; # yes, same as Chris' one, cheers! undef $main::pilotinstdir; $main::use_convert_tool = 0; $main::cgimode = 0; $main::outstyle = $OUT_HTML; $main::outputfilter = 'isilo'; $main::fileperpage = 0; $main::nolinkrewrite = 0; $main::filesizelimit = 200; # limit of Kb per file (uncompressed) $main::use_only_cache = 0; #CGI $main::cgi = undef; $main::admincmd = undef; @main::sites_wanted = (); @main::cmdline_urls = (); %main::caches_to_rename = (); $main::keep_tmps = 0; $main::argv_levels = undef; $main::argv_storyurl = undef; $main::expiry_days = 7.0; $main::useragent = new ScoopHTTP::UserAgent; $main::useragent->env_proxy; $main::useragent->agent ("sitescooper/$main::VERSION ($main::home_url) ". $main::useragent->agent); &clear_page_tmpfiles; # -------------------------------------------------------------------------- if (defined $ENV{'REQUEST_METHOD'}) { # we're running from a CGI script, use CGI mode #$main::cgimode = 1; #$main::cgi = new CGI; } # This is the placeholder for development debug flags. # Add debugging stuff here, tagged with J M D (without the spaces ;). # -------------------------------------------------------------------------- if ($main::cgimode == 0) { while ($#ARGV >= 0) { $_ = shift; if (/^-debug$/) { $main::debug = 1; } elsif (/^-quiet$/) { $main::verbose = 0; } elsif (/^-refresh/) { $main::cached_front_page_lifetime = 0; $main::refresh = 1; #} elsif (/^-cache/) { #$main::just_caching = 1; # used for future parallelism } elsif (/^-dump/) { $main::dump = 1; $main::outstyle = $OUT_TEXT; } elsif (/^-dumpprc/) { $main::dumpprc = 1; } elsif (/^-doc/) { $main::outstyle = $OUT_DOC; $main::fileperpage = 0; $main::outputfilter = 'makedoc'; } elsif (/^-isilo/) { $main::outstyle = $OUT_HTML; $main::fileperpage = 0; $main::outputfilter = 'isilo'; } elsif (/^-misilo/) { $main::outstyle = $OUT_HTML; $main::fileperpage = 1; $main::outputfilter = 'isilo'; } elsif (/^-richreader/) { $main::outstyle = $OUT_HTML; $main::fileperpage = 0; $main::outputfilter = 'richreader'; } elsif (/^-text/) { $main::outstyle = $OUT_TEXT; $main::fileperpage = 0; $main::outputfilter = '__cat__'; } elsif (/^-html/) { $main::outstyle = $OUT_HTML; $main::fileperpage = 0; $main::outputfilter = '__cat__'; } elsif (/^-mhtml/) { $main::outstyle = $OUT_HTML; $main::fileperpage = 1; } elsif (/^-pipe/) { my $fmt = shift; my $cmd = shift; if ($fmt eq 'text') { $main::outstyle = $OUT_TEXT; $main::fileperpage = 0; } elsif ($fmt eq 'html') { $main::outstyle = $OUT_HTML; $main::fileperpage = 0; } elsif ($fmt eq 'mhtml') { $main::outstyle = $OUT_HTML; $main::fileperpage = 1; } else { &usage; } $main::outputfilter = 'cmd: '.$cmd; } elsif (/^-admin$/) { $main::admincmd = shift; } elsif (/^-nolinkrewrite/) { $main::nolinkrewrite = 1; } elsif (/^-fromcache/) { $main::use_only_cache = 1; } elsif (/^-limit/) { $main::filesizelimit = shift(@ARGV)+0; } elsif (/^-nodates/) { $main::nodates = 1; } elsif (/^-nowrite/) { $main::nowrite = 1; } elsif (/^-config/) { $config = shift; } elsif (/^-install/) { $pilotinstdir = shift; } elsif (/^-site/) { push (@sites_wanted, shift); } elsif (/^-levels/) { $argv_levels = shift()+0; } elsif (/^-storyurl/) { $argv_storyurl = shift; } elsif (/^-keep-tmps/) { $main::keep_tmps = 1; } elsif (/^-stdout-to/) { $_ = shift; close (STDOUT); open (STDOUT, ">> ".$_) or die "failed to redirect STDOUT to $_\n"; } elsif (/^-/) { &usage; } else { unshift @ARGV, $_; last; } } @main::cmdline_urls = @ARGV; $main::userid = $<; } else { # load some things from CGI parameters #CGI@main::cmdline_urls = ($main::cgi->param ('url')); #CGI$main::argv_levels = $main::cgi->param ('levels'); #CGI$main::argv_storyurl = $main::cgi->param ('storyurl'); #CGI@main::sites_wanted = $main::cgi->param ('sites'); #CGI$main::debug = $main::cgi->param ('debug'); #CGI$main::outstyle = $main::cgi->param ('outstyle'); #CGI$main::nowrite = $main::cgi->param ('nowrite'); #CGI$main::refresh = $main::cgi->param ('refresh'); #CGI$main::userid = $main::cgi->param ('userid'); #CGI&ScoopCGI::get_cookie; #CGI $main::password = $main::cgi->param ('password'); # REVISIT -- use a cookie to store userid and password #CGI$main::pilotinstdir = undef; } @conflines = (); if (defined $config) { open (IN, "< $config") || die "cannot read $config\n"; @conf = (); close IN; for ($i=0; $i<$#conf; $i++) { $conflines[$i] = "$config:".($i+1); } } else { @conf = split(/\n/, $CONFIG); for ($i=0; $i<$#conf; $i++) { $conflines[$i] = "(built-in):".($i+1); } } if ($main::debugdiffs) { &main::dbg ("debugging, will exit after diff"); } # -------------------------------------------------------------------------- # Andrew Fletcher : # A relative path on Mac seems to need a ":" before it. I've called # this $colon. $colon = ''; $slash = '/'; if (&Portability::MyOS eq 'Win32') { $slash = '\\'; } if (&Portability::MyOS eq 'Mac') { $slash = ':'; $colon = ':'; } $outdir = ''; %site_format = (); %links_start = %links_end = (); %links_limit_to = %story_limit_to = (); %links_print = (); %story_skip = %links_skip = (); %story_diff = %links_diff = (); %links_follow_links = %story_follow_links = (); %story_lifetime = (); %story_postproc = (); %url_postproc = (); %cacheable = (); # 0 = static, 1 = dynamic, undef = use heuristics %printable_sub = (); %head_pat = (); %levels = (); %use_table_smarts = (); %extra_urls = (); @sites = (); $url = ''; $sect = ''; $curkey = ''; $main::cached_front_page_lifetime /= (24*60); # convert to days %url_title = (); $sharedcache = undef; undef $tmpdir; if (&Portability::MyOS eq 'UNIX') { $tmpdir = $ENV{'HOME'}."/.sitescooper"; } $tmpdir ||= $ENV{'TMPDIR'}; $tmpdir ||= $ENV{'TEMP'}; $diff = 'diff'; if (&Portability::MyOS eq 'Win32') { $diff = "diff.exe"; } $makedoc = 'makedoc'; if (&Portability::MyOS eq 'Win32') { $makedoc = "makedocw.exe"; } $isilo = 'iSilo386'; $isiloargs = '-y'; $isilomultipageargs = '-d2'; if (&Portability::MyOS eq 'Win32') { $isilo = "iSiloW32.exe"; $isiloargs = '-u -y'; } $richreader = 'HTML2Doc'; $richargs = ''; if (&Portability::MyOS eq 'Win32') { $richreader = "HTML2Doc.exe"; $richargs = '-i'; } # Note that currently there is no HTML2Doc for UNIX platforms; it's # supported here anyway for future-proofing. $sitesdir = "sites"; if (!-d $sitesdir && &Portability::MyOS eq 'UNIX') { $sitesdir = $ENV{'HOME'}."/sites"; } # --------------------------------------------------------------------------- sub set_got_intr_behaviour { $got_intr_behaviour = shift; $got_intr_flag = 0; } sub got_intr { my $signame = shift; (&Portability::MyOS eq 'UNIX') and system ("stty echo"); if ($got_intr_behaviour eq 'exit') { die "got signal SIG$signame, exiting.\n"; } else { die "got signal SIG$signame, skipping site...\n"; $got_intr_flag = 1; } } &set_got_intr_behaviour ('exit'); $SIG{'INT'} = \&got_intr; $SIG{'TERM'} = \&got_intr; # --------------------------------------------------------------------------- if (!defined $pilotinstdir && !$main::cgimode) { @main::possible_inst_dirs = (); my $dir; if (&Portability::MyOS eq 'Win32') { eval ' sub get_instdir_wanted { return unless (/^install$/i && -d $File::Find::name); push (@main::possible_inst_dirs, $File::Find::name); } my $key = "HKEY_CURRENT_USER/Software/U.S. Robotics". "/Pilot Desktop/Core//Path"; if ($dir = $Registry->{$key}) { @main::possible_inst_dirs = (); find(\&get_instdir_wanted, $dir); } '; } elsif (defined $ENV{'HOME'}) { $dir = $ENV{'HOME'}."/pilot"; if (-d "$dir/install") { @main::possible_inst_dirs = ("$dir/install"); } if (-d $dir) { @main::possible_inst_dirs = ($dir); } } if ($#main::possible_inst_dirs == 0) { $pilotinstdir = $main::possible_inst_dirs[0]; } elsif ($#main::possible_inst_dirs > 0 && !&writing_text) { warn "Fatal: too many potential pilot PRC install directories, ". "please use '-install' argument.\n"; foreach $dir (@main::possible_inst_dirs) { warn "Possible choice: $dir\n"; } &cleanexit(1); } } &ReadSitesDir; # --------------------------------------------------------------------------- my $postproc = undef; my $postproctype = undef; my $proxyhost; my $proxyport = 80; my $confline; foreach $_ (@conf) { $confline = shift @conflines; s/#.*$//; s/^\s+//; s/\s+$//g; next if (/^$/); # process environment variable references: ${ENVVARNAME} # &main::dbg ("variable ref in site file: $1"); s/\$\{(\S+?)\}/ defined($ENV{$1}) ? $ENV{$1} : ""; /ge; if (defined $postproctype) { $postproc .= $_; # see if it's the end of the postproc statement scope $x = $postproc; 1 while ($x =~ s/\{[^\{\}]*\}//gs); #{ if ($x =~ /\}\s*$/) { if ($postproctype eq 'Story') { #{ $postproc =~ /^(.*)\}\s*$/; $story_postproc{$curkey} = $1; $postproc = undef; $postproctype = undef; } if ($postproctype eq 'URL') { #{ $postproc =~ /^(.*)\}\s*$/; $url_postproc{$curkey} = $1; $postproc = undef; $postproctype = undef; } } next; } s/^(\S+:)\s+/$1 /; # easier to read this way ;) /^ProxyHost: (.*)$/ and ($proxyhost = $1), next; /^ProxyPort: (.*)$/ and ($proxyport = $1+0), next; /^TmpDir: (.*)$/ and ($tmpdir = $1), next; if (/^SitesDir: (.*)$/) { $sitesdir = $1; &ReadSitesDir; next; } /^MakeDoc: (.*)$/ and ($makedoc = $1), next; /^iSilo: (.*)$/ and ($isilo = $1), next; /^HTML2Doc: (.*)$/ and ($richreader = $1), next; /^Diff: (.*)$/ and ($diff = $1), next; /^TextSaveDir: (.*)$/ and ($outdir = $1), next; /^PilotInstallDir: (.*)$/ and ($pilotinstdir = $1), next; /^SharedCacheDir: (.*)$/ and ($sharedcache = $1), next; /^ExpireCacheAfter: (.*)$/ and ($expiry_days = $1+0.0), next; if (/^URL: (.*)$/) { &FinishConfigSection ($sect, $url); $url = &expand_url_magic ($1); $sect = ''; if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; } if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; } push (@sites, $url); &SetDefaultConfigForURL ($url); $site_defined_at{$url} = $confline; $curkey = $url; next; } if (!defined $curkey || $curkey eq '') { my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g; die "Configuration line invalid (outside URL scope?) in $line:\n $_\n"; } /^Name: (.*)$/ and ($name{$curkey} = $1), next; /^Description: (.*)$/ and ($desc{$curkey} = $1), next; /^Active: (.*)$/ and ($active{$curkey} = $1+0), next; /^Levels: (.*)$/ and ($levels{$curkey} = $1-2), next; /^AddURL: (.*)$/ and ($extra_urls{$curkey} .= ' '.&expand_url_magic($1)), next; /^UseTableSmarts: (.*)$/ and ($use_table_smarts{$curkey} = $1+0), next; /^IssueLinksStart: (.*)$/ and ($links_start{"1 $curkey"} = $1), next; /^IssueLinksEnd: (.*)$/ and ($links_end{"1 $curkey"} = $1), next; /^IssuePrint: (.*)$/ and ($links_print{"1 $curkey"} = $1+0), next; /^IssueCache?able: (.*)$/ and ($cacheable{"1 $curkey"} = $1+0), next; /^IssueDiff: (.*)$/ and ($links_diff{"1 $curkey"} = $1+0), next; /^IssueUseTableSmarts: (.*)$/ and ($use_table_smarts{"1 $curkey"} = $1+0), next; /^IssueFollowLinks: (.*)$/ and ($links_follow_links{"1 $curkey"} = $1+0), next; # Normally Issue-level stuff is the highest level, so this would seem to # be irrelevant as we never would have to decide whether a URL is the # issues page since it's provided in the site file. However the # IssueFollowLinks parameter provides a need for this. if (/^IssueURL: (.*)$/) { my $pat = $1; if (!defined ($links_limit_to{"1 $curkey"})) { $links_limit_to{"1 $curkey"} = "($pat)"; } else { $links_limit_to{"1 $curkey"} =~ s/\)$/|$pat)/g; } next; } if (/^ContentsFormat: (.*)$/) { my $fmt = $1; if ($fmt eq 'rss') { # set up defaults for a Rich Site Summary site. # cf. http://my.netscape.com/publish/ $site_format{$url} = 'rss'; $links_start{"0 $curkey"} = '(proxy (['http', 'ftp'], "http://$proxyhost:$proxyport/"); } # --------------------------------------------------------------------------- # if ($just_caching) { # # just put the pages into the cache and forget about it # foreach $url (@main::cmdline_urls) { # &log ("bg: getting $url ..."); # &get_page ($url, 0); # } # &log ("bg: done."); # &cleanexit; # } if ($#main::cmdline_urls > -1) { @sites = (); foreach $url (@main::cmdline_urls) { # if it's a local file URL, switch around the slashes (for windows) if (&Portability::MyOS eq 'Win32' && $url =~ m,file:///,i) { $url =~ s/\\/\//g; } # REVISIT -- I don't know what to do in the same case for MacOS ;) if (-r $url) { if ($url =~ m,^/,) { $url = 'file://'.$url; } else { eval ' use Cwd; $url = "file://".getcwd."/".$url; 1;' or die ("eval failed: $@"); } } if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; } if (!defined $name{$url}) { $name{$url} = $url; if ($url =~ m,/([^/]+)$,) { $name{$url} = $1; } } $confline = "$url:0"; push (@sites, $url); &SetDefaultConfigForURL ($url); $site_defined_at{$url} = $confline; $story_lifetime{$url} = 0; # any age of story for command-line URLs if (defined $argv_levels) { $levels{$url} = $argv_levels-2; } if (defined $argv_storyurl) { $story_limit_to{$url} = $argv_storyurl; } } } # --------------------------------------------------------------------------- ($mday, $mon, $year, $monstr) = &get_date; if (!$nodates) { $date_for_filename = sprintf ("%04d_%02d_%02d_", $year, $mon, $mday); $title = "$year-".$monstr."-$mday: "; } else { $date_for_filename = ''; $title = ''; } %already_seen = (); %last_modtime = (); %main::oldest_already_seen = (); @seen_this_time = (); $main::failed_to_cvt = 0; $main::use_convert_tool = (!&writing_text); $main::warn_about_external_links = 0; # turned on where necessary &make_dirs; &generate_output_filenames (@sites); if (defined $main::admincmd) { if ($main::admincmd eq 'dump-sites') { while (($key,$outfile) = each %key2outfile) { my $url = $key2url{$key}; my $title = $key2title{$key}; $title =~ s,\t, ,g; $title =~ s,^\d+-\S+-\d+: ,,g; my $base = $key2tmp{$key}; $base =~ s,^.*${slash}(\S+?)\.tmp$,$1,o; my $site = $site_defined_at{$url}; $site =~ s/:\d+$//; $site =~ s/^.*${slash}(\S+?)$/$1/o; # foobar.site http://www.foobar.com/ Foo Bar 1999_01_01_Foo_Bar print "$site\t$url\t$title\t$base\n"; } exit; } elsif ($main::admincmd eq 'journal') { open (JOURNAL, "> $tmpdir${slash}journal") or die "cannot write to $tmpdir${slash}journal!\n"; } else { &usage; } } if ($main::use_convert_tool) { if (!-d $pilotinstdir) { mkdir ($pilotinstdir, 0755) || die "failed to mkdir '$pilotinstdir'\n"; } } &expire_old_cache_files; &ScoopHTTP::UserAgent::load_logins; &read_state; # to do all the conversions at the end: #&get_all_sites; #foreach $filekey (@main::filekeys) { &convert_output($filekey); } # to do them as each site is scooped: &get_all_sites (1); &write_state; &ScoopHTTP::UserAgent::save_logins; &verbose ("Finished!"); &cleanexit; # --------------------------------------------------------------------------- sub ReadSitesDir { my ($file, $key); my %sites_wanted = (); %read_sites = (); @site_files_to_read = (); if ($#sites_wanted >= 0) { &verbose ("Restricting to sites: ".join (' ', @sites_wanted)); foreach $key (@sites_wanted) { if (-r $key) { # it's a site file, not a name, add it to the list push (@site_files_to_read, $key); } $sites_wanted{$key} = 1; } } if ($#main::cmdline_urls >= 0) { # we're only snarfing the command-line URLs, skip the predefined sites return; } if (defined $sitesdir) { foreach $file (<${colon}$sitesdir${slash}*.site>) { next if ($file =~ /(\.swp$|core|\.bak$|\~$|^#)/); # skip backups next if (-d $file); # skip directories if ($#sites_wanted >= 0) { my $base = $file; $base =~ s,^.*[\/\\:]([^\/\\:]+)$,$1,g; &main::dbg ("checking if site file is wanted: $file"); next unless (defined $sites_wanted{$base} || defined $sites_wanted{$file}); } push (@site_files_to_read, $file); } } foreach $file (@site_files_to_read) { next if (defined $read_sites{$file}); $read_sites{$file} = 1; if (open (IN, "< $file")) { my $line = 0; while () { push (@conf, $_); push (@conflines, "$file:$line"); $line++; } close IN; &verbose ("Scooping site from file \"$file\"."); } else { &sitewarn_file_line ("$file:0", "Cannot read $file\n"); } } } # --------------------------------------------------------------------------- # Default configuration for a newly-specified URL. sub SetDefaultConfigForURL { my $url = shift; $sections{$url} = ""; # none yet $active{$url} = 1; # active by default $use_table_smarts{$url} = 1; # use smarts $levels{$url} = -1; # 1-level site $extra_urls{$url} = ''; # no extra URLs $story_lifetime{$url} = 90; # don't scoop stories older than 3 months # default limit to articles at the same site $url =~ m,^((http|file)://[^/]*/),i; if (defined $1) { $story_limit_to{$url} = $1.'.*'; } else { &sitewarn_file_line ($confline, "Unsupported URL protocol for URL '".$url."'.\n"); } } # --------------------------------------------------------------------------- # Incorporate defaults from the main URL into each Section. # sub FinishConfigSection { my $sect = shift; my $url = shift; if ($sect ne '') { if (!defined $name{$sect}) { $name{$sect} = $url; } if (!defined $desc{$sect}) { $desc{$sect} = $url; } if (!defined $story_start{$sect}) { $story_start{$sect} = $story_start{$url}; } if (!defined $story_end{$sect}) { $story_end{$sect} = $story_end{$url}; } if (!defined $head_pat{$sect}) { $head_pat{$sect} = $head_pat{$url}; } if (!defined $printable_sub{$sect}) { $printable_sub{$sect} = $printable_sub{$url}; } if (!defined $story_limit_to{$sect}) { $story_limit_to{$sect} = $story_limit_to{$url}; } if (!defined $story_skip{$sect}) { $story_skip{$sect} = $story_skip{$url}; } if (!defined $story_diff{$sect}) { $story_diff{$sect} = $story_diff{$url}; } if (!defined $story_follow_links{$sect}) { $story_follow_links{$sect} = $story_follow_links{$url}; } if (!defined $story_lifetime{$sect}) { $story_lifetime{$sect} = $story_lifetime{$url}; } if (!defined $active{$sect}) { $active{$sect} = $active{$url}; } # If the main site is disabled, so are the sub-sites. if ($active{$url} == 0) { $active{$sect} = 0; } $levels{$sect} = $levels{$url}; for ($lev = $levels{$url}; $lev >= 0; $lev--) { if (!defined $links_start{"$lev $sect"}) { $links_start{"$lev $sect"} = $links_start{"$lev $url"}; } if (!defined $links_end{"$lev $sect"}) { $links_end{"$lev $sect"} = $links_end{"$lev $url"}; } if (!defined $links_skip{"$lev $sect"}) { $links_skip{"$lev $sect"} = $links_skip{"$lev $url"}; } if (!defined $links_diff{"$lev $sect"}) { $links_diff{"$lev $sect"} = $links_diff{"$lev $url"}; } if (!defined $links_print{"$lev $sect"}) { $links_print{"$lev $sect"} = $links_print{"$lev $url"}; } if (!defined $links_follow_links{"$lev $sect"}) { $links_follow_links{"$lev $sect"} = $links_follow_links{"$lev $url"}; } if (!defined $links_limit_to{"$lev $sect"}) { $links_limit_to{"$lev $sect"} = $links_limit_to{"$lev $url"}; } } } } # --------------------------------------------------------------------------- sub make_dirs { if (!-d $tmpdir) { mkdir ($tmpdir, 0777) || die "failed to mkdir '$tmpdir'\n"; } chdir ($tmpdir) or die "cannot cd to $tmpdir\n"; $user_tmpdir = "$tmpdir${slash}sitescooper_$userid"; # passwords for sitescooper caches are not fully impled right now! # #if ($main::cgimode) { #open (PWD, "< $user_tmpdir${slash}passwd"); #my $pwd = ; close PWD; #my $salt = substr($pwd, 0, 2); #if (crypt ($main::password, $salt) ne $pwd) { #&ScoopCGI::passwd_failed; exit; #} #} if (!-d $user_tmpdir) { mkdir ($user_tmpdir, 0777) || die "failed to mkdir '$user_tmpdir'\n"; } if (!defined $pilotinstdir) { $pilotinstdir = "$user_tmpdir${slash}prc"; if (!-d $pilotinstdir) { mkdir ($pilotinstdir, 0777) || die "failed to mkdir '$pilotinstdir'\n"; } if (!$cgimode) { &verbose ("Warning: since no PilotInstallDir was specified". " in the configuration,\nI\'ll use $pilotinstdir .\n"); } } if ($main::debug) { open (LOGFILE, "> $user_tmpdir${slash}log.txt"); select LOGFILE; $| = 1; select STDOUT; } if ($outdir eq '') { $outdir = "$user_tmpdir${slash}txt"; } if (!-d $outdir) { mkdir ($outdir, 0777) || die "failed to mkdir '$outdir'\n"; } $cachedir = "$user_tmpdir${slash}cache"; $newcachedir = $cachedir; if (!-d $cachedir) { mkdir ($cachedir, 0777) || die "failed to mkdir '$cachedir'\n"; } if (defined $sharedcache) { if (!-d $sharedcache) { mkdir ($sharedcache, 0777) || die "failed to mkdir '$sharedcache'\n"; } } $alreadyseen = "$user_tmpdir${slash}already_seen.txt"; $newalreadyseen = $alreadyseen; if ($nowrite) { $newcachedir = "$user_tmpdir${slash}new_cache"; if (!-d $newcachedir) { mkdir ($newcachedir, 0777) || die "failed to mkdir '$newcachedir'\n"; } $newalreadyseen = "$user_tmpdir${slash}new_already_seen.txt"; } # check for spaces on Win32 -- MakeDocW can't handle them! # Thx to wgoosey /at/ servtech.com for spotting this one. if ($main::outputfilter eq 'makedoc') { if (&Portability::MyOS eq 'Win32') { if ($outdir =~ / /) { warn " Warning: Sitescooper is installed in a directory containing spaces in the filename. The MakeDocW conversion tool does not support this, so you may need to move Sitescooper to another directory, e.g. C:\\Sitescooper, for this conversion to work! (This is a bug in MakeDOCW.exe.) "; } } } } # --------------------------------------------------------------------------- sub expire_old_cache_files { if (defined $pilotinstdir) { sub expire_prcdir { unlink if (-f $_ && -M $_ > $main::expiry_days); } find(\&expire_prcdir, $pilotinstdir); } if (defined $outdir) { if (!$main::use_convert_tool) { sub expire_outdir { unlink if (-f $_ && -M $_ > $main::expiry_days); } find(\&expire_outdir, $outdir); } } sub expire_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); } find(\&expire_cache, $cachedir); if (defined $sharedcache) { sub expire_shared_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); } find(\&expire_shared_cache, $sharedcache); } } # --------------------------------------------------------------------------- sub read_state { if ($main::refresh == 0) { if (!open (IN, "< $alreadyseen")) { &verbose ("Cannot read $alreadyseen, creating a new one"); } else { my $url; my $mod; my $urlhost; while () { / lastmod=/; $url = $`; $mod = $'; $already_seen{$url} = 1; $last_modtime{$url} = $mod+0; m,http://(\S+?)/,; $urlhost = $1; if (defined($main::oldest_already_seen_this_site{$urlhost}) ? $main::oldest_already_seen_this_site{$urlhost} > $mod : 1) { $main::oldest_already_seen_this_site{$urlhost} = $mod; } } close IN; } } } # --------------------------------------------------------------------------- sub generate_output_filenames { my @sites = @_; my %already_done = (); foreach $site (@sites) { my @urls = ($site); if ($sections{$site} ne "") { @urls = split (/\|\|\|/, $sections{$site}); } foreach $url (@urls) { next if ($url eq ''); next unless ($active{$url} == 1); my $filekey = $site.$url; $sitename = $name{$site}; $sectname = ''; if ($site ne $url) { $sectname = "_".$name{$url}; } my $filedesc = $date_for_filename.$sitename.$sectname; $filedesc =~ s/[^-_A-Za-z0-9]+/_/g; $filedesc =~ s/_+$//; $filedesc =~ s/^_+//; if (&Portability::MyOS eq 'Mac') { # try to limit the filename to 32 characters $filedesc =~ s/^(.{32}).*$/$1/g; } $outidxfile = undef; if (&writing_html && $main::fileperpage) { $outfile = $outdir.$slash.$filedesc.'.pages'; $outidxfile = $filedesc.'.html'; } elsif (&writing_html) { $outfile = $outdir.$slash.$filedesc.'.html'; } else { $outfile = $outdir.$slash.$filedesc.'.txt'; } next if (defined $already_done{$outfile}); $already_done{$outfile} = 1; $tmpfile = $outdir.$slash.$filedesc.'.tmp'; $sectname =~ s/_+$//; $sectname =~ s/^_+//; my $secttitle = "${title}$sitename" . ($sectname ne '' ? ": $sectname" : ""); $main::key2tmp{$filekey} = $tmpfile; $main::key2outfile{$filekey} = $outfile; $main::key2outidxfile{$filekey} = $outidxfile; $main::key2title{$filekey} = $secttitle; $main::key2sitename{$filekey} = $sitename; $main::key2site{$filekey} = $site; $main::key2url{$filekey} = $url; push (@main::filekeys, $filekey); if ($main::dumpprc) { $main::key2syncfile{$filekey} = $tmpfile; # reuse it! } else { $main::key2syncfile{$filekey} = $pilotinstdir.$slash.$filedesc.'.prc'; } } } } # --------------------------------------------------------------------------- sub get_all_sites { my $convert_now = shift; if (!defined $convert_now) { $convert_now = 0; } foreach $filekey (@main::filekeys) { $outfile = $key2outfile{$filekey}; $tmpfile = $key2tmp{$filekey}; my $outidxfile = $key2outidxfile{$filekey}; my $secttitle = $key2title{$filekey}; my $sitename = $key2sitename{$filekey}; my $site = $key2site{$filekey}; my $url = $key2url{$filekey}; # This apparently is needed on MacOS. Sounds unlikely, but there it # is... if (&Portability::MyOS eq 'Mac') { eval ' use File::Basename; my $parentdir = dirname($outfile); if (!-d $parentdir) { mkdir ($parentdir, 0755); } 1;' or die ("eval failed: $@"); } &main::dbg ("tmp file: $tmpfile, out file: $outfile"); (-d $tmpfile) && &rmtree ($tmpfile); if ($main::outstyle == $OUT_HTML && $main::fileperpage) { mkdir ($tmpfile, 0755) || die "cannot mkdir $tmpfile\n"; &clear_page_tmpfiles; $main::output_filename = $tmpfile.$slash.$outidxfile; } else { $main::output_filename = $tmpfile; } $_ = $site_defined_at{$url}; /^(.*):/; &verbose ("Now scooping site \"$1\"."); $main::output_file = ''; %main::output_links_snarfed = (); %main::oldest_already_seen_this_site = (); $main::current_story_index = 0; @main::output_story_urls = (); if ($main::outstyle == $OUT_HTML) { $main::output_file .= "$secttitle". "

$secttitle


\n"; } else { $main::output_file .= "$secttitle\n\n\n"; } $stories_found = 0; $file_size = 0; $hit_file_size_limit = 0; &set_got_intr_behaviour ('setflag'); my $u; foreach $u ($url, split (' ', $extra_urls{$url})) { # if we were interrupted, clear the flag and go on if ($got_intr_flag) { &set_got_intr_behaviour ('setflag'); } if ($main::outstyle == $OUT_HTML && $main::fileperpage) { $page_to_tmpfile{$u} = $tmpfile.$slash.$outidxfile; } else { $page_to_tmpfile{$u} = $tmpfile; } if ($levels{$url} >= 0) { &download_front_page ($u, $url, $levels{$url}); } else { # just read the text and write it to a file &download_story_page ($u, $url, 1); } } &set_got_intr_behaviour ('exit'); if ($stories_found > 0) { &verbose ("$secttitle: $stories_found stories downloaded (". sprintf ("%3.1f", $file_size/1024)." K uncompressed)."); # convert sitescooper navigation links: [<-][->] { my $ind = $main::current_story_index; # trim off the first and last ones anyway $main::output_file =~ s/\[.*?<\/a>\]//g; # and run through the rest for ($i = 0; $i < $ind; $i++) { $main::output_file =~ s/\"__SITESCOOPER_STORY_${i}\"/\"${main::output_story_urls[$i]}\"/g; } } $main::output_file = &remove_external_links ($main::output_file); my $blurb1 = "(End of snarf - copyright retained by original ". "providers."; my $blurb2 = "Downloaded and converted by sitescooper; see ". "$main::home_url )"; if ($main::outstyle == $OUT_HTML) { $main::output_file .= "


$blurb1 $blurb2\n"; } elsif ($main::outstyle == $OUT_DOC) { $main::output_file .= "$blurb1 $blurb2\n<$main::bookmark_char>\n"; } else { $main::output_file .= "$blurb1\n$blurb2\n"; } open (OUTFILE, "> $main::output_filename") or die "Failed to create $main::output_filename\n"; print OUTFILE $main::output_file; close OUTFILE or warn "Failed to write to $tmpfile"; if (!$main::fileperpage) { if ($main::dump) { open (IN, "< $tmpfile"); while () { print STDOUT; } close IN; unlink ($tmpfile); # ensure we don't try to convert it later undef $main::key2syncfile{$filekey}; } else { unlink ($outfile); rename ($tmpfile, $outfile); if ($convert_now) { &convert_output($filekey); } } } else { &rmtree ($outfile); rename ($tmpfile, $outfile); if ($main::dump) { # print the path to the created directory containing the pages print $outfile."\n"; # ensure we don't try to convert it later undef $main::key2syncfile{$filekey}; } else { if ($convert_now) { &convert_output($filekey); } } } my ($from, $to); while (($from,$to) = each %main::oldest_already_seen_this_site) { &main::dbg ("Saving new already_seen age cache entry: $from => $to ". "(".&time2datestr($to).")"); $main::oldest_already_seen{$from} = $to; } } else { close OUTFILE; &verbose ("$secttitle: no new stories, ignoring."); &main::dbg ("(Not setting already_seen age cache since no links were followed)"); undef $main::key2syncfile{$filekey}; if ($main::fileperpage) { &rmtree ($tmpfile); } else { unlink $tmpfile; } } $main::output_file = ''; } } # --------------------------------------------------------------------------- sub convert_output { my $filekey = shift; return unless ($main::use_convert_tool); my $syncfile = $key2syncfile{$filekey}; return unless defined $syncfile; $outfile = $key2outfile{$filekey}; my $outidxfile = $key2outidxfile{$filekey}; my $secttitle = $key2title{$filekey}; unlink $syncfile; if ($main::outputfilter eq '__cat__') { open (IN, "< ".$outfile); while () { print STDOUT; } close IN; unlink $outfile; return; } if ($main::outputfilter eq 'makedoc') { $cmd = "$makedoc $outfile ".$syncfile." '".$secttitle."'"; } elsif ($main::outputfilter eq 'isilo') { if ($main::fileperpage) { $cmd = "$isilo $isiloargs $isilomultipageargs ". $outfile.$slash.$outidxfile; } else { $cmd = "$isilo $isiloargs ".$outfile; } # UNIX iSilo utils take the output filename as well; Win32 # doesn't need it as it installs as it goes along. if (&Portability::MyOS ne 'Win32') { $cmd .= " ".$syncfile; } # Win32 iSilo only takes the -u arg for the GUI version, not the # command line one. Strip the arg for the command-line converter. if (&Portability::MyOS eq 'Win32') { if ($cmd =~ /isiloc32/i) { $cmd =~ s/ -u / /g; } } } elsif ($main::outputfilter eq 'richreader') { $cmd = "$richreader $richargs ".$outfile; # UNIX HTML2Doc utils take the output filename as well; Win32 # doesn't need it as it installs as it goes along. if (&Portability::MyOS ne 'Win32') { $cmd .= " ".$syncfile; } } elsif ($main::outputfilter =~ /^cmd: /) { $cmd = $'; $cmd =~ s/__SCOOPFILE__/${outfile}/g; $cmd =~ s/__SYNCFILE__/${syncfile}/g; $cmd =~ s/__TITLE__/${secttitle}/g; } else { die "bad output filter $main::outputfilter\n"; } if (&Portability::MyOS ne 'Mac') { &add_cmd_dir_to_path ($cmd); &verbose ("Running: $cmd"); system $cmd; # output a newline, MakeDoc won't do it itself. if (&Portability::MyOS eq 'UNIX') { &verbose ("\n"); } if (($? >> 8) != 0) { # work around a bug in iSilo converter on Win32 -- it # reports failure even when the conversion went fine. if (&Portability::MyOS ne 'Win32' || $main::outputfilter ne 'isilo') { warn "command failed: $cmd\n"; $main::failed_to_cvt = 1; } } } else { # system is broken on MacOS, so print the required command #so it can be run easily from MPW shell if (!defined $macos_system_warning_written) { warn "[Warning: not using the broken MacPerl system() call. ". "You will need to\ncut and paste the command ". "lines yourself!]\n\n"; $macos_system_warning_written = 1; } print $cmd, "\n"; } if (!$main::debug && !$main::keep_tmps) { &rmtree ($outfile); # don't keep .txt files around } # If we're dumping, read in the generated file and write it to # STDOUT. if ($main::dumpprc) { open (IN, "< ".$syncfile); while () { print STDOUT; } close IN; unlink $syncfile; } # output the name of the finished file. This is handy for scripts # which want to collect these files and store them somewhere. # REVISIT -- verify that Win32 iSilo uses the same filename. if (!$main::dumpprc) { print "Created: ".$syncfile."\n"; } } sub add_cmd_dir_to_path { local ($_); my $cmd = shift; # Perl on some Win32 platforms seems to require that the binary be # in the PATH. # if (&Portability::MyOS eq 'Win32') { $_ = $cmd; if (!/[\\\/:]/) { return; } # foo arg ... if (/^\"([^\"]+)\"/) { $cmd = $1; } # "C:\Program Files\foo.exe" arg ... elsif (/^(\S+)\s/) { $cmd = $1; } # C:\windows\foo.exe arg ... else { $cmd = $_; } # C:\windows\foo.exe $cmd =~ s,[\\/][^\\/]+\s*$,,g; # trim the filename $cmdpat = $cmd; $cmdpat =~ s,(\W),\\$1,g; # escape funny chars if ($ENV{'PATH'} !~ /;${cmdpat}(;|$)/) { &main::dbg ("Adding directory to command path: $cmd"); my $path = $ENV{'PATH'} || $ENV{'Path'} || $ENV{'path'}; $path .= ";$cmd"; $ENV{'PATH'} = $path; } } } # --------------------------------------------------------------------------- sub write_state { if (!$main::failed_to_cvt) { # only write alreadyseen if the files converted successfully, otherwise # the user may lose some recent news due to a makedoc screwup. # my $towrite = ''; my $now = time; my $twomonthsago = $now - (24*60*60*30*2); my $mod; my $urlhost; # keep the already-seen list small by cutting out old entries. We # define "old entries" as (a) older than 2 months and (b) older than # the oldest link we saw in today's scooping run. # if (!$main::refresh) { &main::dbg ("trying to cut old entries from already-seen URL cache"); foreach $_ (keys %already_seen) { m,http://(\S+?)/,; $urlhost = $1; if (defined $last_modtime{$_} && defined $main::oldest_already_seen{$urlhost}) { $mod = $last_modtime{$_}; if ($twomonthsago > $mod && $main::oldest_already_seen{$urlhost} > $mod) { &main::dbg ("stripping old entry: $_ lastmod=$mod (".&time2datestr($mod).")"); next; } } $towrite .= $_." lastmod=".(defined $last_modtime{$_} ? $last_modtime{$_} : $now)."\n"; } if (open (OUT, "> $newalreadyseen")) { print OUT $towrite; # do it as one big atomic write, for safety close OUT || warn "Cannot rewrite $newalreadyseen\n"; } else { warn "Cannot rewrite $newalreadyseen\n"; } } else { # it's small enough -- so we can just append to it. &main::dbg ("appending already-seen URLs to $newalreadyseen"); foreach $_ (@seen_this_time) { $towrite .= $_." lastmod=".(defined $last_modtime{$_} ? $last_modtime{$_} : $now)."\n"; } if (open (OUT, ">> $newalreadyseen")) { print OUT $towrite; # do it as one big atomic write, for safety close OUT || warn "Cannot append to $newalreadyseen\n"; } else { warn "Cannot append to $newalreadyseen\n"; } } my ($from, $to); while (($from,$to) = each %caches_to_rename) { &main::dbg ("Saving new cache file: $to"); rename ($from, $to) or warn ("rename $from -> $to failed\n"); } } } # --------------------------------------------------------------------------- # Note on levels: a 2-level site has a contents page and stories off that; # 3-level has issue links page, per-issue contents page and stories. # 1-level has only the story page, no links. sub download_front_page { my $url = shift; my $baseurl = shift; my $level = shift; my ($cachefile, $page); my $key = "$level $baseurl"; $sitewarn_current_site_line = $site_defined_at{$baseurl}; # Use this hash to avoid endless loops when scooping multi-page front pages. return if (defined $already_seen_this_session{$url}); $already_seen_this_session{$url} = 1; if (defined $links_limit_to{$key}) { if (!match_url ($url, $links_limit_to{$key})) { &dbg ("front page URL $url does not match ".${links_limit_to{$key}}.", ignoring."); return; } } if (defined $links_skip{$key}) { if ($url =~ m#^${links_skip{$key}}$#) { &verbose ("Skipping: $url"); return; } } my $origurl = $url; $url = &apply_url_postproc($url, $baseurl); if (!defined $url) { &main::dbg ("URLProcess says URL should be ignored: $origurl"); return; } &verbose ("Reading level-".($level+2)." front page: $url"); &set_got_intr_behaviour ('setflag'); my $is_dynamic_html; if (defined $cacheable{$key}) { $is_dynamic_html = ($cacheable{$key} == 0); } elsif (defined $links_diff{$key} && $links_diff{$key} != 0) { $is_dynamic_html = 1; # pages that need diff'ing are dynamic } elsif ($level < $levels{$baseurl}) { # second-level or deeper front pages are usually not dynamic, more # likely to be a static table of contents. $is_dynamic_html = 0; } else { $is_dynamic_html = 1; # front pages are usually dynamic } push (@seen_this_time, $url); $already_seen {$url} = 1; &check_for_oldest ($url); # we came across the link, so keep it around $page = &get_page ($url, $is_dynamic_html); return unless defined $page; if ($got_intr_flag) { goto interrupted; } if (defined $last_modtime{$url} && $last_modtime{$url} < $story_lifetime{$baseurl} * 24 * 60 * 60) { &verbose ("Skipping (contents are older than ". $story_lifetime{$baseurl}." days): $url"); return; } my $origpage = $page; &journal ("pre_strip_level".($level+2), $page); $page = &strip_front_page ($url, $key, $baseurl, $page); &journal ("post_strip_level".($level+2), $page); my $cachedpage; if (defined $links_diff{$key} && $links_diff{$key} != 0) { $cachedpage = &strip_front_page ($url, $key, $baseurl, &get_cached_page_for_diff ($url)); $page = &get_new_bits ($cachedpage, $page); &cache_page_later ($url, $origpage); } else { &cache_page_later ($url, $origpage); } if (defined fileno JOURNAL) { # always write a text-mode version for the journal &journal ("to_text_level".($level+2), &html_to_text ($url, $baseurl, $page, $OUT_TEXT)); } if ((defined $links_print{$key} && $links_print{$key} != 0) || &writing_html) { $main::warn_about_external_links = 1; my $txtpage = &html_to_text ($url, $baseurl, $page, $main::outstyle); $main::warn_about_external_links = 0; my $outme = 1; if ($is_dynamic_html && defined $cachedpage && !$main::refresh) { # ensure that the cleaned-up HTML doesn't match the cleaned-up cached # HTML. Sometimes the ad banners will be the only things that have # changed between retrieves, and html_to_text will have stripped those # out. my $cachedtxt = &html_to_text ($url, $baseurl, $cachedpage, $main::outstyle); if (&text_equals ($txtpage, $cachedtxt)) { &verbose ("Not printing contents (text has not changed): $url"); $outme = 0; } } if ($outme) { &verbose ("Printing: $url"); &write_as_story (1, $url, $baseurl, $txtpage, undef); } if (&writing_html && (!defined $links_print{$key} || $links_print{$key} == 0)) { # don't count the front page as a story if we're just outputting it # because we're writing HTML. $stories_found--; } } # see if there's any links to extra contents pages my @turnoverlinks = &get_contents_turnover_links ($url, $key, $page); my @links = (); my $wrote_sep = 0; # This was all getting a bit tricky, so I've redone it a bit. # It now does not try to strip closing tags, as it doesn't have to. while (1) { if ($got_intr_flag) { goto interrupted; } if ( $page =~ s/]*href=\"([^\"%>]+)\"//is || $page =~ s/]*href=([^\s>]+)//is ) { push (@links, $1); next; } # support for frames if ( $page =~ s/]*src=\"([^\">]+)\"//is || $page =~ s/]*src=([^\s+>]+)//is ) { my $link = $1; if (&writing_html) { if ($wrote_sep == 0) { $main::output_file .= "


\n"; $wrote_sep = 1; } $main::output_file .= &translate_link ($url, $baseurl, $link, $link). "
\n"; } push (@links, $link); next; } # rudimentary support for My-Netscape-style RDF files if ($page =~ s/(.*?)]*>(.+?)<\/link>(.*?)<\/item>//is) { my ($title, $link, $title2) = ($1, $2, $3); # tags in RSS can contain other crap. Ditch it; we want the link! $link =~ s/^.*(.*?)<\/url>.*$/$1/gis; $link = &AbsoluteURL ($url, $link); if ($title =~ /(.*?)<\/title>/is || $title =~ /<text>(.*?)<\/text>/is || $title2 =~ /<title>(.*?)<\/title>/is || $title2 =~ /<text>(.*?)<\/text>/is) { $url_title{$link} = $1; } push (@links, $link); next; } last; # no more links available } if ($#links >= 0) { &verbose ("Found ".($#links+1)." links, examining them."); } # now traverse the links and get the stories &journal ("links_level".($level+2), join ("\n", @links)); foreach $_ (@links) { if ($hit_file_size_limit) { my $msg = "File size limit of $main::filesizelimit K exceeded,". " skipped some stories from this site."; &verbose ($msg); if (&writing_html) { $main::output_file .= "<hr><i>$msg</i><br>\n"; } else { $main::output_file .= "\n($msg)\n"; } last; } &follow_front_link ($baseurl, $url, $level, $_); if ($got_intr_flag) { goto interrupted; } } # if there's more contents pages, process them as well. &journal ("turnover_links_level".($level+2), join ("\n", @turnoverlinks)); if ($#turnoverlinks >= 0) { my $link; for $link (@turnoverlinks) { if ($got_intr_flag) { goto interrupted; } $link = &AbsoluteURL ($url, $link); &download_front_page ($link, $baseurl, $level); } } interrupted: &set_got_intr_behaviour ('exit'); } # --------------------------------------------------------------------------- sub follow_front_link { my ($baseurl, $url, $level, $nextpage) = @_; $nextpage = &AbsoluteURL ($url, $nextpage); &main::dbg ("Link found on $baseurl: $nextpage"); # should we download the next front page? if (defined $links_start{($level-1)." $baseurl"}) { &download_front_page ($nextpage, $baseurl, $level-1); return; } if ($got_intr_flag) { return; } # nope, we're onto the stories already $nextpage = &make_printable ($baseurl, $nextpage, 1); &download_story_page ($nextpage, $baseurl, 0); } sub make_printable { my $baseurl = shift; my $nextpage = shift; my $warn_if_fail = shift; if (defined $printable_sub{$baseurl}) { my $new = $nextpage; my $sub = $printable_sub{$baseurl}; $sub =~ s/\\(\d+)/\$$1/g; # avoid warnings eval '$new =~ '.$sub.'; 1;' or &sitewarn ("Printable substitution failed! ($!)\n"); if ($nextpage ne $new) { # &verbose ("Using printable version instead: $new"); if (defined $story_limit_to{$baseurl} && !match_url ($new, $story_limit_to{$baseurl})) { if ($warn_if_fail) { &sitewarn ("Printable version does not match StoryURL". "pattern, reverting to $nextpage\n"); } } else { $nextpage = $new; } } } $nextpage; } # --------------------------------------------------------------------------- sub download_story_page { my $url = shift; my $baseurl = shift; my $is_dynamic_html = shift; my ($cachefile, $page); $url =~ s/#.*$//g; # strip references study $url; $sitewarn_current_site_line = $site_defined_at{$baseurl}; my $cacheflag = $cacheable{"s $baseurl"}; if (defined $cacheflag) { # user setting overrides our heuristics $is_dynamic_html = ($cacheflag==0); } if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) { $is_dynamic_html = 1; # diff pages are always dynamic } if (defined $story_limit_to{$baseurl}) { if (!defined $output_storyurl_dbg{$baseurl}) { &main::dbg ("StoryURL for $baseurl: ${story_limit_to{$baseurl}}"); $output_storyurl_dbg{$baseurl} = 1; } if (!match_url ($url, $story_limit_to{$baseurl})) { &main::dbg ("Non-story URL ignored: $url"); return; } } if ($url =~ m,^(ftp|mailto|https|gopher|pnm)://,) { &main::dbg ("Non-story URL ignored: $url"); return; } my $origurl = $url; $url = &apply_url_postproc($url, $baseurl); if (!defined $url) { &main::dbg ("URLProcess says URL should be ignored: $origurl"); return; } if (defined $story_skip{$baseurl}) { if ($url =~ m#^${story_skip{$baseurl}}$#) { &verbose ("Skipping: $url"); return; } } &check_for_oldest ($url); if (!$is_dynamic_html && $already_seen {$url}) { &main::dbg ("skipping, already seen: $url"); return; } push (@seen_this_time, $url); $already_seen {$url} = 1; &get_story_page ($url, $baseurl, $is_dynamic_html); } # --------------------------------------------------------------------------- sub get_story_page { my $url = shift; my $baseurl = shift; my $is_dynamic_html = shift; my @turnoverlinks; my $headline; &verbose ("Reading: $url"); &check_for_oldest ($url); # we came across the link, so keep it around my $cachedpage = undef; if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) { $cachedpage = &get_cached_page_for_diff ($url); } elsif ($is_dynamic_html) { $cachedpage = &get_cached_page ($url); } if (defined $cachedpage) { $cachedpage = &strip_story ($url, $baseurl, $cachedpage, " (cached)"); } my $origpage = &get_page ($url, $is_dynamic_html); return unless defined $origpage; if ($got_intr_flag) { return; } # get headline before stripping StoryStart and StoryEnd $headline = &get_headline ($url, $baseurl, $origpage); &journal ("pre_strip_story", $origpage); my $page = &strip_story ($url, $baseurl, $origpage, ""); &journal ("post_strip_story", $page); if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) { $page = &get_new_bits ($cachedpage, $page); &cache_page_later ($url, $origpage); } else { &cache_page ($url, $origpage); } if ($got_intr_flag) { return; } if (defined fileno JOURNAL) { # always write a text-mode version for the journal &journal ("to_text_story", &html_to_text ($url, $baseurl, $page, $OUT_TEXT)); } # get turn-over links after stripping StoryStart and StoryEnd @turnoverlinks = &get_story_turnover_links ($url, $baseurl, $page); $main::warn_about_external_links = 1; $page = &html_to_text ($url, $baseurl, $page, $main::outstyle); $main::warn_about_external_links = 0; if ($is_dynamic_html && defined $cachedpage && !$main::refresh) { # ensure that the cleaned-up HTML doesn't match the cleaned-up cached # HTML. Sometimes the ad banners will be the only things that have # changed between retrieves, and html_to_text will have stripped those # out. $cachedpage = &html_to_text ($url, $baseurl, $cachedpage, $main::outstyle); if (&text_equals ($page, $cachedpage)) { &verbose ("Skipping (text has not changed): $url"); return; } } if (defined $last_modtime{$url} && $last_modtime{$url} < $story_lifetime{$baseurl} * 24 * 60 * 60) { &verbose ("Skipping (story is older than ". $story_lifetime{$baseurl}." days): $url"); return; } # ensure there's some alphanumerics in the output text. No alnums means # no output. HTML needs to be checked to ensure we don't just pick # up tags, which will not be displayed. if ((&writing_html && $page !~ /[A-Za-z0-9]\s*</ && $page !~ />\s*[A-Za-z0-9]/ && $page !~ /^\s*[A-Za-z0-9]/) || (!&writing_html && $page !~ /[A-Za-z0-9]/)) { &verbose ("Skipping (no text to write): $url"); return; } if ($levels{$baseurl} < 0) { # this is a one-level site: therefore the story should be treated # as the "front page". Thx Carsten for this one. &write_as_story (1, $url, $baseurl, $page, $headline); } else { &write_as_story (0, $url, $baseurl, $page, $headline); } &journal ("turnover_links_story", join ("\n", @turnoverlinks)); if ($#turnoverlinks >= 0) { my $link; for $link (@turnoverlinks) { if ($got_intr_flag) { return; } $link = &AbsoluteURL ($url, $link); &download_story_page ($link, $baseurl, 0); # right now } } } # --------------------------------------------------------------------------- sub apply_url_postproc { local ($_) = shift; my $baseurl = shift; if (defined $url_postproc{$baseurl}) { if (!eval $url_postproc{$baseurl}."; 1;") { &sitewarn("URLProcess failed: $@"); undef $_; } } $_; } # --------------------------------------------------------------------------- sub clean_pre_tags_for_diff { my $file = shift; my $pre_nl_tag = shift; my $pre_pre_tag = shift; my $pre_slashpre_tag = shift; my $start = ''; my $end = ''; ($file =~ s/^(.*)<pre>//i) and $start = $1; ($file =~ s/<\/pre>(.*)$//i) and $end = $1; $file =~ s/\n/${pre_nl_tag}/gs; $start.$pre_pre_tag.$file.$pre_slashpre_tag.$end; } sub get_new_bits { local ($_); my ($oldfile, $newfile) = @_; if (!defined $oldfile || $oldfile =~ /^\s*$/) { if (!$main::debugdiffs) { return $newfile; } $oldfile = ''; } if ($main::refresh) { &verbose ("-refresh is on, not looking for differences"); return $newfile; } &verbose ("Finding differences between current page and cached version"); # it's important to keep these names 8.3 for Windows-95 compatibility, # as some Windoze diffs may not be able to handle them otherwise! # This also requires that we are chdir'd into the temporary directory # to avoid hassles with long filenames in the args when we run the # diff command. What a pain! # my $oldf = "a$$.tmp"; # we are already chdir'ed my $newf = "b$$.tmp"; if ($main::debugdiffs) { $oldf = "diff_old.tmp"; $newf = "diff_new.tmp"; } open (F1, "> $oldf") || warn "cannot write to $oldf\n"; open (F2, "> $newf") || warn "cannot write to $newf\n"; # Split the file lines at probable story-header endpoints. # This makes them more amenable to diffing, hopefully without # losing bits we don't want to lose, or gaining bits we don't # want to gain. Also try to keep cross-line-split HTML tags # together. # preserve newlines in <pre> text my $cleaned_pre_nls = 0; my $pre_nl_tag = "<!!!n>"; my $pre_pre_tag = "<!!!pre>"; my $pre_slashpre_tag = "<!!!/pre>"; while ($oldfile =~ /<pre>/i) { $oldfile = &clean_pre_tags_for_diff ($oldfile, $pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag); $cleaned_pre_nls = 1; } while ($newfile =~ /<pre>/i) { $newfile = &clean_pre_tags_for_diff ($newfile, $pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag); $cleaned_pre_nls = 1; } # canonicalise all other newlines (we control the vertical!) $oldfile =~ s/\s*[\r\n]+\s*/ /gs; $newfile =~ s/\s*[\r\n]+\s*/ /gs; # remove extraneous whitespace from inside tags $oldfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies; $newfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies; # handle the two types of <p> tags -- <p>...</p>, and just ...<p> $oldfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi; $newfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi; $oldfile =~ s/(<p *[^>]*>)/$1\n/gi; $newfile =~ s/(<p *[^>]*>)/$1\n/gi; $oldfile =~ s/<!!!p/<p/gi; $newfile =~ s/<!!!p/<p/gi; # put newline before these tags (thx Carsten Clasohm, again!) $oldfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi; $newfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi; # after these ones $oldfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi; $newfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi; # remove newlines inside <a href> tags. Thx to Carsten Clasohm. 1 while $oldfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis; 1 while $newfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis; if ($cleaned_pre_nls) { $oldfile =~ s/${pre_nl_tag}/\n/g; $oldfile =~ s/${pre_pre_tag}/<pre>/g; $oldfile =~ s/${pre_slashpre_tag}/<\/pre>/g; $newfile =~ s/${pre_nl_tag}/\n/g; $newfile =~ s/${pre_pre_tag}/<pre>/g; $newfile =~ s/${pre_slashpre_tag}/<\/pre>/g; } print F1 $oldfile; close F1; print F2 $newfile; close F2; my $page = ''; open (DIFF, "$diff $oldf $newf |") || warn "cannot run $diff\n"; while (<DIFF>) { /^>/ || next; $page .= $'; } close DIFF; # ignore exit status -- exit 1 only means no diffs. if ($main::debugdiffs) { open (F1, "> diff_out.tmp"); print F1 $page; close F1; warn "$diff $oldf $newf, breaking for debug"; &cleanexit; } unlink $oldf; unlink $newf; $page; } # --------------------------------------------------------------------------- sub text_equals { my $t1 = shift; my $t2 = shift; $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//; $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//; ($t1 eq $t2); } # --------------------------------------------------------------------------- # Strip a story page from StoryStart to StoryEnd. # In addition, strip out non-story sidebar table items # and carriage returns (they confuse plenty of regexps later). # sub strip_story { my $url = shift; my $baseurl = shift; my $page = shift; my $comment = shift; if (!defined $page) { return undef; } # ok, now strip the headers and footers my $pat = $story_start{$baseurl}; if (defined $pat) { if ($page =~ /${pat}.*${pat}/) { &sitewarn("StoryStart pattern \"$pat\" found multiple times in page $url$comment\n"); } if ($page =~ s#^.*?${pat}##gs) { if (defined fileno JOURNAL) { &journal ("pre_stripped", $&); } $page =~ s#^[^<]*?>##gs; # strip superfluous ends of tags if (defined fileno JOURNAL) { &journal ("pre_stripped", $&); } } else { &sitewarn("StoryStart pattern \"$pat\" not found in page $url$comment\n"); } } $pat = $story_end{$baseurl}; if (defined $pat) { if ($page =~ /${pat}.*${pat}/) { &sitewarn("StoryEnd pattern \"$pat\" found multiple times in page $url$comment\n"); } if ($page =~ s#${pat}.*?$##gs) { if (defined fileno JOURNAL) { &journal ("post_stripped", $&); } $page =~ s#<[^>]*?$##gs; # strip superfluous starts of tags if (defined fileno JOURNAL) { &journal ("post_stripped", $&); } } else { &sitewarn("StoryEnd pattern \"$pat\" not found in page $url$comment\n"); } } # &smart_clean_table only operates on table items with size specifications. # TODO -- this does not handle <td width=100>...<td>foo</td><td>bar</td></td> # $page =~ s/<td\s+([^>]+)>(.*?)<\/td>/ &smart_clean_table ($baseurl, $1, $2, $baseurl); /gies; $page =~ s/\r/ /g; # strip CRs $page; } sub strip_front_page { my $url = shift; my $key = shift; my $baseurl = shift; my $page = shift; if (!defined $page) { return undef; } my $pat = $links_start{$key}; if (defined $pat) { if ($page =~ /${pat}.*${pat}/) { &sitewarn("ContentsStart pattern \"$pat\" found multiple times in page $url\n"); } ($page =~ s#^.*?${pat}##gs) || &sitewarn("ContentsStart pattern \"$pat\" not found in page $url\n"); $page =~ s#^[^<]*?>##gs; # strip cut-in-half tags } $pat = $links_end{$key}; if (defined $pat) { if ($page =~ /${pat}.*${pat}/) { &sitewarn("ContentsEnd pattern \"$pat\" found multiple times in page $url\n"); } ($page =~ s#${pat}.*?$##gs) || &sitewarn("ContentsEnd pattern \"$pat\" not found in page $url\n"); $page =~ s#<[^>]*?$##gs; # strip cut-in-half tags } $page =~ s/<td\s+([^>]+)>(.*?)<\/td>/ &smart_clean_table($baseurl, $1, $2, $key); /gies; $page =~ s/\r/ /g; # strip CRs $page; } # --------------------------------------------------------------------------- sub get_headline { my $url = shift; my $baseurl = shift; my $page = shift; my $headline; if (defined $url_title{$url}) { $headline = &html_to_text ($url, $baseurl, $url_title{$url}, $OUT_TEXT); &main::dbg ("StoryHeadline: (from RDF): $headline"); } elsif (defined $head_pat{$baseurl}) { my $pat = $head_pat{$baseurl}; if ($page !~ m#${pat}#m) { &sitewarn("StoryHeadline pattern \"$pat\" not found in page $url\n"); } elsif (defined $1) { $headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT); # &main::dbg ("StoryHeadline: $headline"); # logged later on anyway } else { &sitewarn("StoryHeadline pattern \"$pat\" does not contain brackets!\n"); } } elsif ($page =~ m#<meta name="PCTITLE" content="(.*)">#mi) { # try a fallback: search for PointCast headline tags $headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT); &main::dbg ("StoryHeadline (default, PointCast): $headline"); } $headline; } # --------------------------------------------------------------------------- sub get_story_turnover_links { my $url = shift; my $baseurl = shift; my $page = shift; my @turnoverlinks = (); while ($page =~ s,<a\s+[^>]*href=(?:\"|%22)?([^>]+)(?:\"|%22)?>(.+?)</a>,,i) { my $link = $1; my $txt = $2; if (defined $story_follow_links {$baseurl} && $story_follow_links {$baseurl}) { push (@turnoverlinks, $link); } elsif ($txt =~ m,(more|next|\d+ of \d+|>>),i) { my $urlguts = '.'; ($baseurl =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and ($urlguts = $1); if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 5 or more words && (length ($txt) < 15) && $link =~ m/$urlguts/) { push (@turnoverlinks, $link); $txt =~ s/[\n\r]+/ /g; &verbose ("(Following 'next page' link: \"$txt\")"); } } } @turnoverlinks; } # --------------------------------------------------------------------------- sub get_contents_turnover_links { my $url = shift; my $key = shift; my $page = shift; if (!defined $links_follow_links{$key} || !$links_follow_links{$key}) { return (); } my @turnoverlinks = (); while ($page =~ s,<a\s+[^>]*href=(?:\"|%22)?([^>]+)(?:\"|%22)?>(.+?)</a>,,i) { my $link = $1; my $txt = $2; if (defined $links_follow_links {$key} && $links_follow_links {$key}) { push (@turnoverlinks, $link); } # we don't do the automatic "more/next/page x of y" stuff # that we do with the story pages } @turnoverlinks; } # --------------------------------------------------------------------------- sub remove_an_ext_link { my ($link, $text, $ahref, $posthref) = @_; if (!&writing_html) { return $text; } if (defined ($main::output_links_snarfed{$link}) || $main::nolinkrewrite) { $ahref.$link.$posthref.$text."</a>"; } else { &dbg ("Removing non-snarfed link: $link"); $text; # without <a href=...> </a> } } sub remove_external_links { local ($_) = $_[0]; # &dbg (join(' ', sort keys %main::output_links_snarfed)); s/(<a\s+[^>]*href=\")([^\"]+)(\"[^>]*?>)(.*?)<\/a>/ &remove_an_ext_link ($2, $4, $1, $3); /gies; $_; } # We could do this smarter, but it looks really gross when converted to # DOC format -- and this tool is primarily for that conversion. Sorry! # This also works well for iSilo, because iSilo's rendering of <pre> text # is pretty rotten. # sub clean_preformatted_text { my $txt = shift; $txt =~ s/[ \t]+\n/\n/g; $txt =~ s/<(|\/)(pre|code)>//g; # strip extra <pre> tags! # convert blank lines to a paragraph separator. $txt =~ s/\n{1,}\n/<p>\n\n/g; # The idea with this one is to add a <br> at the end of lines shorter # than 50 columns, and conversely to allow lines longer than 50 cols to # run into the next line as if they were part of a paragraph. I'm not # sure about it, but a lot of <pre> sites are just copies of emails, so # it can make them look a lot better, since the Palm's screen is a # lot narrower than 80 columns (which is what most <pre> pages aim for). # REVISIT - Make this a .site file parameter? $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g; $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g; $txt =~ s/[ \t]+/ /g; $txt; } # Work out if we should strip table items based on their size -- well, # their width at least. # sub smart_clean_table { my $baseurl = $_[0]; my $contents = $_[2]; my $key = $_[3]; if ($use_table_smarts{$key}) { $_ = " $_[1] "; s/\s+/ /g; s/ = /=/g; s/"//g; #my $replace = ' '; #if ($main::debug) { #$replace = "[table item \<td$_\> omitted]\n"; #} if (/ width=(\d+) /i) { if ($1+0 < 250) { return ' '; } } elsif (/ width=(\d+)% /i) { if ($1+0 < 40) { return ' '; } } } $contents; } sub translate_link { my ($url, $baseurl, $link, $text, $ahref, $posthref) = @_; if (!&writing_html) { return $text; } if (!defined $ahref) { $ahref = "<a href="; } if (!defined $posthref) { $posthref = ">"; } $link = &AbsoluteURL ($url, $link); if ($main::nolinkrewrite) { return $ahref."\"".$link."\"".$posthref.$text."</a>"; } # translate to printable version first, in case the StoryURL pattern # only covers the printable style. $link = &make_printable ($baseurl, $link, 0); # Is the link one that we will be downloading? If not, just de-linkify # it. 1-level sites never have active links so we can just assume # the links should not be links. if (!match_url ($link, $story_limit_to{$baseurl}) || $levels{$baseurl} < 0) { # check the contents/issue levels as well. my $ok = 0; for ($lev = $levels{$baseurl}; $lev >= 0; $lev--) { my $key = "$lev $baseurl"; if (defined $links_limit_to{$key}) { if (!match_url ($link, $links_limit_to{$key})) { $ok = 1; last; } } } if ($ok == 0 && $link ne $baseurl) { if ($main::warn_about_external_links) { &main::dbg ("External link not translated: $link"); } # return $text; # without <a href=...> </a> return "<u>".$text."</u>"; } } #&main::dbg ("Translating link: $link"); # Note that we always put in quotes around the URL. # &remove_external_links, which is run later, requires this (and anyway # it makes for better HTML). # if ($main::fileperpage) { my ($fname, $relative) = &get_page_tmpfile ($link); $ahref."\"".$relative."\"".$posthref.$text."</a>"; } else { my $anchor = $link; $anchor =~ s/[^-_A-Za-z0-9]/_/g; $ahref."\"#".$anchor."\"".$posthref.$text."</a>"; } } sub html_to_text { my $url = shift; my $baseurl = shift; my $page = shift; my $format = shift; if ((defined $site_format{$baseurl}) && ($site_format{$baseurl} eq 'rss')) { # Convert the RSS formatting into a nice display, for the index page. $page =~ s,<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>,<h2>$2<\/h2> $1 $3,gis; my $link; $page =~ s/<link>(.*?)<\/link>/ $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g; $link = &AbsoluteURL($url,$link); '(<a href='.$link.'>'.$link.'<\/a>)'; /gies; $page =~ s,<title>(.*?)<\/title>,<b>$1<\/b> ,gis; $page =~ s,<item>,<p>,gis; $page =~ s,<\/item>,<\/p>,gis; # the description is converted for RSS 0.91 sites -- the "fat" format $page =~ s,<description>(.*?)<\/description>,$1 ,gis; } # strip tags we know we don't want $page =~ s/<form(\s+[^>]+|)>.*?<\/form>/ /gis; $page =~ s/<style(\s+[^>]+|)>.*?<\/style>/ /gis; # not yet $page =~ s/<image(\s+[^>]+|)>.*?<\/image>/ /gis; # RDF tag $page =~ s/<channel(\s+[^>]+|)>.*?<\/channel>/ /gis; # RDF tag $page =~ s/<script(\s+[^>]+|)>.*?<\/script>/ /gis; $page =~ s/<map(\s+[^>]+|)>.*?<\/map>/ /gis; $page =~ s/<head(\s+[^>]+|)>.*?<\/head>/ /gis; $page =~ s/<iframe(\s+[^>]+|)>.*?<\/iframe>/ /gis; $page =~ s/<ilayer(\s+[^>]+|)>.*?<\/ilayer>/ /gis; $page =~ s/<layer(\s+[^>]+|)>.*?<\/layer>/ /gis; $page =~ s/<frame(\s+[^>]+|)>/ /gis; $page =~ s/<applet(\s+[^>]+|)>.*?<\/applet>/ /gis; $page =~ s/<item(\s+[^>]+|)>.*?<\/item>/ /gis; # some RDF items $page =~ s/<link(\s+[^>]+|)>.*?<\/link>/ /gis; # some RDF items $page =~ s/<title(\s+[^>]+|)>.*?<\/title>/ /gis; # some RDF items $page =~ s/<img\s+[^>]*src=[^>]+>/ /gis; # inline images $page =~ s/<!--.*?-->/ /gis; # MSIE-style comments $page =~ s/<!--.*?>/ /gis; # Netscape-style comments # Clean up some mishaps that can occur with diffed pages and # multi-line-spanning formatting tags! # $page =~ s,<(b)>(.*?)<(/b)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(i)>(.*?)<(/i)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(strong)>(.*?)<(/strong)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(small)>(.*?)<(/small)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(big)>(.*?)<(/big)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(a\s+[^>]+)>(.*?)<(/a)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(font\s+[^>]+)>(.*?)<(/font)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(code(?:\s+[^>]+|))>(.*?)<(/code)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(pre(?:\s+[^>]+|))>(.*?)<(/pre)>,<!!!$1>$2<!!!$3>,gis; $page =~ s,<(h\d(?:\s+[^>]+|))>(.*?)<(/h\d)>,<!!!$1>$2<!!!$3>,gis; # any tags left are stray tags with no opposite tag, so strip 'em. $page =~ s,<(|/)(a|b|i|small|big|strong|font|code|pre|h\d)(?:\s+[^>]+|)>, ,gis; $page =~ s,<!!!,<,g; # and bring it back to normal # convert <pre> text to proper HTML, it displays better. $page =~ s/<pre>(.*?)<\/pre>/&clean_preformatted_text($1);/gies; $page =~ s/<code>(.*?)<\/code>/&clean_preformatted_text($1);/gies; # strip all existing line breaks, they will just confuse matters # when we convert to text or HTML. It's also easier to do proper diffs # when we control the placement of newlines. $page =~ s/[\r\n]+/ /gs; if ($format == $OUT_DOC) { # Create DOC bookmarks at <a name> tags # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu> # via Christopher Heschong's <chris@screwdriver.net> # webpage-to-prc converter. Nice one lads, good trick! $page =~ s/<a name.*?>/$main::bookmark_char /gis; } if ($format == $OUT_HTML) { # note the conversion of href= to href!!!=. This stops the second # substitution from screwing up the output of the first one! $page =~ s/(<a\s+[^>]*href)=(?:\"|%22)([^\">]+)(?:\"|%22)([^>]*?>)(.*?)<\/a>/ &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3); /gies; $page =~ s/(<a\s+[^>]*href)=([^>\s\n]+)([^>]*>)(.*?)<\/a>/ &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3); /gies; $page =~ s/href!!!=/href=/gis; # back to normal # This works around a bug (I think) in iSilo that makes Wired News # indexes look rotten. Shouldn't be harmful anyway. $page =~ s/<br>\s*\ \s*<br>/<br><br>/gis; # since we're rendering to HTML, line breaks are OK. Put them back in! $page =~ s,(<(br|p|hr|table|td|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis; # ...but clean up useless tags and whitespace at the start and end # of the text. $page =~ s,^(<(br|hr|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>|\s+),,gis; $page =~ s,^(<(br|p|hr|td|table|tr|h\d|div)\s*[^>]*>|\s+),,gis; } if ($format == $OUT_DOC || $format == $OUT_TEXT) { # We're converting to DOC or text format, so we need to do a lot # more work here. # a sidebar enclosed by a table? separate it from the rest of the text. $page =~ s/<\/tr>/\n\n/gis; $page =~ s/<\/table>/\n\n/gis; # end of <table> $page =~ s/<\/pre>/\n\n/gis; # end of <pre> text $page =~ s/<(\/h\d|h\d)(\s+[^>]+|)>/\n\n/gis; # headings $page =~ s/<\/?blockquote(\s+[^>]+|)>/\n\n/gis; # quotes $page =~ s/<hr(\s+[^>]+?|)>/\n\n/gis; # horiz lines $page =~ s/<br(\s+[^>]+?|)>/\n/gis; # end-of-line markers $page =~ s/<li(\s+[^>]+?|)>/\n/gis; # list items $page =~ s/<\/?p(\s+[^>]+?|)>/\n\n/gis; # don't worry, multiple blank lines are sorted later $page =~ s/<\/td>/ /gis; # end-of-table-item 1 while ($page =~ s/<[^>]+?>//gs); # trim all other tags # do a few escapes here -- the commonly used ones, as they are most # likely to show up capitalised where they shouldn't be. # The spec seems confused on this point. $page =~ s/\ / /gi; $page =~ s/\&/\&/gi; $page =~ s/\"/\"/gi; $page =~ s/\©/(c)/gi; $page =~ s/\®/(r)/gi; $page =~ s/\</</gi; $page =~ s/\>/>/gi; $page = &remove_html_escapes ($page); # sort the lot of 'em out # trim multiple (blank) bookmarks $page =~ s/($main::bookmark_char\s+){2,}/$main::bookmark_char /gs; } $page =~ s/[ \t]+/ /g; # canonicalise down to one space $page =~ s/\n /\n/gs; # leading w/s on each line $page =~ s/\n{3,}/\n\n/gs; # too many blank lines $page =~ s/^\s+//gs; # blank space at start of story $page =~ s/\s+$//gs; # blank space at end of story $page; } sub remove_html_escapes { # Convert special HTML characters # This code was shamelessly stolen from # http://pilot.screwdriver.net/convert.pl.txt , # Christopher Heschong's <chris@screwdriver.net> webpage-to-pilot # conversion tool. (whoops, reinvented the wheel again --j. ;) # # In turn, he credits the following folks: # From: "Yannick Bergeron" <bergery@videotron.ca> # And Especially: Sam Denton <Sam.Denton@maryville.com> # # Cheers lads! my $page = shift; my %escapes = ( # first, the big four HTML escapes 'quot', '"', # quote 'amp', '&', # ampersand 'lt', '<', # less than 'gt', '>', # greater than # Sam got most of the following HTML 4.0 names from # http://spectra.eng.hawaii.edu/~msmith/ASICs/HTML/Style/allChar.htm 'emsp', "\x80", # em space (HTML 2.0) 'sbquo', "\x82", # single low-9 (bottom) quotation mark (U+201A) 'fnof', "\x83", # Florin or Guilder (currency) (U+0192) 'bdquo', "\x84", # double low-9 (bottom) quotation mark (U+201E) 'hellip', "\x85", # horizontal ellipsis (U+2026) 'dagger', "\x86", # dagger (U+2020) 'Dagger', "\x87", # double dagger (U+2021) 'circ', "\x88", # modifier letter circumflex accent 'permil', "\x89", # per mill sign (U+2030) 'Scaron', "\x8A", # latin capital letter S with caron (U+0160) 'lsaquo', "\x8B", # left single angle quotation mark (U+2039) 'OElig', "\x8C", # latin capital ligature OE (U+0152) 'diams', "\x8D", # diamond suit (U+2666) 'clubs', "\x8E", # club suit (U+2663) 'hearts', "\x8F", # heart suit (U+2665) 'spades', "\x90", # spade suit (U+2660) 'lsquo', "\x91", # left single quotation mark (U+2018) 'rsquo', "\x92", # right single quotation mark (U+2019) 'ldquo', "\x93", # left double quotation mark (U+201C) 'rdquo', "\x94", # right double quotation mark (U+201D) 'endash', "\x96", # dash the width of ensp (Lynx) 'ndash', "\x96", # dash the width of ensp (HTML 2.0) 'emdash', "\x97", # dash the width of emsp (Lynx) 'mdash', "\x97", # dash the width of emsp (HTML 2.0) 'tilde', "\x98", # small tilde 'trade', "\x99", # trademark sign (HTML 2.0) 'scaron', "\x9A", # latin small letter s with caron (U+0161) 'rsaquo', "\x9B", # right single angle quotation mark (U+203A) 'oelig', "\x9C", # latin small ligature oe (U+0153) 'Yuml', "\x9F", # latin capital letter Y with diaeresis (U+0178) 'ensp', "\xA0", # en space (HTML 2.0) 'thinsp', "\xA0", # thin space (Lynx) # from this point on, we're all (but 2) HTML 2.0 'nbsp', "\xA0", # non breaking space 'iexcl', "\xA1", # inverted exclamation mark 'cent', "\xA2", # cent (currency) 'pound', "\xA3", # pound sterling (currency) 'curren', "\xA4", # general currency sign (currency) 'yen', "\xA5", # yen (currency) 'brkbar', "\xA6", # broken vertical bar (Lynx) 'brvbar', "\xA6", # broken vertical bar 'sect', "\xA7", # section sign 'die', "\xA8", # spacing dieresis (Lynx) 'uml', "\xA8", # spacing dieresis 'copy', "\xA9", # copyright sign 'ordf', "\xAA", # feminine ordinal indicator 'laquo', "\xAB", # angle quotation mark, left 'not', "\xAC", # negation sign 'shy', "\xAD", # soft hyphen 'reg', "\xAE", # circled R registered sign 'hibar', "\xAF", # spacing macron (Lynx) 'macr', "\xAF", # spacing macron 'deg', "\xB0", # degree sign 'plusmn', "\xB1", # plus-or-minus sign 'sup2', "\xB2", # superscript 2 'sup3', "\xB3", # superscript 3 'acute', "\xB4", # spacing acute 'micro', "\xB5", # micro sign 'para', "\xB6", # paragraph sign 'middot', "\xB7", # middle dot 'cedil', "\xB8", # spacing cedilla 'sup1', "\xB9", # superscript 1 'ordm', "\xBA", # masculine ordinal indicator 'raquo', "\xBB", # angle quotation mark, right 'frac14', "\xBC", # fraction 1/4 'frac12', "\xBD", # fraction 1/2 'frac34', "\xBE", # fraction 3/4 'iquest', "\xBF", # inverted question mark 'Agrave', "\xC0", # capital A, grave accent 'Aacute', "\xC1", # capital A, acute accent 'Acirc', "\xC2", # capital A, circumflex accent 'Atilde', "\xC3", # capital A, tilde 'Auml', "\xC4", # capital A, dieresis or umlaut mark 'Aring', "\xC5", # capital A, ring 'AElig', "\xC6", # capital AE diphthong (ligature) 'Ccedil', "\xC7", # capital C, cedilla 'Egrave', "\xC8", # capital E, grave accent 'Eacute', "\xC9", # capital E, acute accent 'Ecirc', "\xCA", # capital E, circumflex accent 'Euml', "\xCB", # capital E, dieresis or umlaut mark 'Igrave', "\xCC", # capital I, grave accent 'Iacute', "\xCD", # capital I, acute accent 'Icirc', "\xCE", # capital I, circumflex accent 'Iuml', "\xCF", # capital I, dieresis or umlaut mark 'Dstrok', "\xD0", # capital Eth, Icelandic (Lynx) 'ETH', "\xD0", # capital Eth, Icelandic 'Ntilde', "\xD1", # capital N, tilde 'Ograve', "\xD2", # capital O, grave accent 'Oacute', "\xD3", # capital O, acute accent 'Ocirc', "\xD4", # capital O, circumflex accent 'Otilde', "\xD5", # capital O, tilde 'Ouml', "\xD6", # capital O, dieresis or umlaut mark 'times', "\xD7", # multiplication sign 'Oslash', "\xD8", # capital O, slash 'Ugrave', "\xD9", # capital U, grave accent 'Uacute', "\xDA", # capital U, acute accent 'Ucirc', "\xDB", # capital U, circumflex accent 'Uuml', "\xDC", # capital U, dieresis or umlaut mark 'Yacute', "\xDD", # capital Y, acute accent 'THORN', "\xDE", # capital THORN, Icelandic 'szlig', "\xDF", # small sharp s, German (sz ligature) 'agrave', "\xE0", # small a, grave accent 'aacute', "\xE1", # small a, acute accent 'acirc', "\xE2", # small a, circumflex accent 'atilde', "\xE3", # small a, tilde 'auml', "\xE4", # small a, dieresis or umlaut mark 'aring', "\xE5", # small a, ring 'aelig', "\xE6", # small ae diphthong (ligature) 'ccedil', "\xE7", # small c, cedilla 'egrave', "\xE8", # small e, grave accent 'eacute', "\xE9", # small e, acute accent 'ecirc', "\xEA", # small e, circumflex accent 'euml', "\xEB", # small e, dieresis or umlaut mark 'igrave', "\xEC", # small i, grave accent 'iacute', "\xED", # small i, acute accent 'icirc', "\xEE", # small i, circumflex accent 'iuml', "\xEF", # small i, dieresis or umlaut mark 'dstrok', "\xF0", # small eth, Icelandic (Lynx) 'eth', "\xF0", # small eth, Icelandic 'ntilde', "\xF1", # small n, tilde 'ograve', "\xF2", # small o, grave accent 'oacute', "\xF3", # small o, acute accent 'ocirc', "\xF4", # small o, circumflex accent 'otilde', "\xF5", # small o, tilde 'ouml', "\xF6", # small o, dieresis or umlaut mark 'divide', "\xF7", # division sign 'oslash', "\xF8", # small o, slash 'ugrave', "\xF9", # small u, grave accent 'uacute', "\xFA", # small u, acute accent 'ucirc', "\xFB", # small u, circumflex accent 'uuml', "\xFC", # small u, dieresis or umlaut mark 'yacute', "\xFD", # small y, acute accent 'thorn', "\xFE", # small thorn, Icelandic 'yuml', "\xFF", # small y, dieresis or umlaut mark ); foreach $_ (32..126, 160..255) { $escapes{'#'.$_} = pack('c',$_); } $page =~ s/&((\w*)|(\#\d+));/$escapes{$1} || $&/egi; $page; } # --------------------------------------------------------------------------- sub check_for_oldest { my $url = shift; $url =~ m,http://(\S+?)/,; my $urlhost = $1; #&main::dbg ("checking to see if $url is oldest at its site: modtime=". #(defined $last_modtime{$url} ? $last_modtime{$url} : "unknown)")); if (defined $last_modtime{$url}) { if (defined($main::oldest_already_seen_this_site{$urlhost}) ? $main::oldest_already_seen_this_site{$urlhost} > $last_modtime{$url} : 1) { &main::dbg ("oldest link seen at $urlhost $url: modtime=".$last_modtime{$url}); $main::oldest_already_seen_this_site{$urlhost} = $last_modtime{$url}; } } } # --------------------------------------------------------------------------- sub cachefilename { my $url = shift; my $cachefile; undef $cachefile; if (defined $cachedir) { $cachefile = $url; $cachefile =~ s/[^-_A-Za-z0-9]/_/g; $cachefile = $cachedir.$slash.$cachefile; } } sub sharedcachefilename { my $url = shift; my $cachefile; undef $cachefile; if (defined $sharedcache) { $cachefile = $url; $cachefile =~ s/[^-_A-Za-z0-9]/_/g; $cachefile = $sharedcache.$slash.$cachefile; } } sub newcachefilename { my $url = shift; my $cachefile; undef $cachefile; if (defined $newcachedir) { $cachefile = $url; $cachefile =~ s/[^-_A-Za-z0-9]/_/g; $cachefile = $newcachedir.$slash.$cachefile; } } sub get_cached_page { my $url = shift; my $is_diff_page = shift; my $cachefile = &cachefilename ($url); if (!defined $cachefile) { return undef; } # if -refresh is on, do not return any cached pages. if ($main::refresh) { return undef; } if (open (IN, "< $cachefile")) { binmode IN; my $cachedpage = join ('', <IN>); close IN; $cachedpage; } else { undef; } } sub get_cached_page_for_diff { &get_cached_page (@_, 1); } sub get_page { my $url = shift; my $is_dynamic_html = shift; my $page = ''; my $cachefile = &cachefilename ($url); my $cachedpage = &get_cached_page ($url); &check_for_oldest ($url); my $lastmod; if (defined $cachefile && defined $cachedpage) { if ($is_dynamic_html == 0) { &main::dbg("cached version exists"); return $cachedpage; } elsif (defined (-M $cachefile) && -M _ < $main::cached_front_page_lifetime && -M _ > 0) # just make sure the clock is sane { &main::dbg("cached version is new enough: ".(-M $cachefile)." days"); return $cachedpage; } elsif ($main::use_only_cache) { &main::dbg("-fromcache switch is on, using cached version"); return $cachedpage; } } # see if we have it in the shared cache if (defined $sharedcache) { $cachedpage = undef; $cachefile = &sharedcachefilename ($url); if (defined $cachefile && (open (IN, "< $cachefile"))) { binmode IN; $cachedpage = join ("", <IN>); close IN; if ($cachedpage =~ s/^<!-- lastmod: (\d+) -->//) { $lastmod = $1+0; } } if (defined $cachefile && defined $cachedpage) { if ($is_dynamic_html == 0) { &main::dbg("shared-cache version exists"); if (defined $lastmod) { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); } return $cachedpage; } elsif (defined (-M $cachefile) && -M _ < $main::cached_front_page_lifetime && -M _ > 0) { &main::dbg("shared-cache version is new enough: ".(-M $cachefile)." days"); if (defined $lastmod) { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); } return $cachedpage; } elsif ($main::use_only_cache) { &main::dbg("-fromcache switch is on, using shared-cache version"); if (defined $lastmod) { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); } return $cachedpage; } } undef $cachedpage; # if it didn't pass those tests, don't keep it! } if ($main::use_only_cache) { &main::dbg("-fromcache switch is on, not doing HTTP request"); return undef; } if ($got_intr_flag) { return undef; } use HTTP::Request::Common; my $req = new HTTP::Request ('GET', $url); # REVISIT - support POST $req->header ("Accept-Language" => "en", "Accept-Charset" => "iso-8859-1,*,utf-8"); my $resp = undef; $cmd = '$resp = $main::useragent->request ($req);'; my $timeout = 10; # minutes # REVISIT -- implement timeout for Win32 perl if (&Portability::MyOS eq 'UNIX') { eval ' local $SIG{"ALRM"} = sub { die "alarm\n" }; alarm $timeout*60; { ' . $cmd. ' } alarm 0; '; } else { eval $cmd; } die if $@ && $@ ne "alarm\n"; if ($@) { &sitewarn ("HTTP GET timed out, $timeout minutes without a response."); &got_intr; } if ($got_intr_flag) { return undef; } if (!$resp->is_success) { &sitewarn ("HTTP GET failed: ".$resp->status_line." ($url)"); return undef; } $page = $resp->content; # REVISIT - use $resp->base as new base url? if (defined $resp->last_modified) { # this seems to be obsolete -- last-modified dates are now being # returned as UNIX time_t's. Nice one! # #use HTTP::Date; #$lastmod = undef; # protect against a nasty die in Time::Local::timegm(). #my $x = $SIG{__DIE__}; $SIG{__DIE__} = 'warn'; #eval { #$lastmod = str2time ($resp->last_modified); #}; #$SIG{__DIE__} = $x; # a bit absurd all that, really $lastmod = $resp->last_modified; &main::dbg ("last-modified time for $url: $lastmod (".&time2datestr($lastmod).")"); if (defined $last_modtime{$url} && defined($lastmod) && $lastmod <= $last_modtime{$url} && !$main::refresh) { &verbose ("Skipping (no mod since last download): $url"); $last_modtime{$url} = $lastmod+0; &check_for_oldest ($url); return undef; } } else { &main::dbg ("last-modified time for $url: not provided"); $lastmod = time; } $last_modtime{$url} = $lastmod; &check_for_oldest ($url); if ($is_dynamic_html && defined $cachedpage && $cachedpage eq $page && !$main::refresh) { &verbose ("Skipping (HTML has not changed): $url"); return undef; } $page; } # --------------------------------------------------------------------------- sub cache_page { my ($url, $page, $cachelater) = @_; my $cachefile = &newcachefilename ($url); # if this page is the latest version of a diffed page, don't cache it # immediately, as it will mean lost stories if we're interrupted. # Instead save the filename for renaming when the run finishes. if (defined $cachelater && $cachelater == 1) { my $tmpname = $cachefile.'.tmp'; $main::caches_to_rename{$tmpname} = $cachefile; $cachefile = $tmpname; } open (C1OUT, "> $cachefile"); binmode C1OUT; print C1OUT $page; close C1OUT; if (defined $sharedcache) { $cachefile = &sharedcachefilename ($url); open (C2OUT, "> $cachefile"); binmode C2OUT; if (defined $last_modtime {$url}) { # cache the last-modified time of this page as well. print C2OUT "<!-- lastmod: ",$last_modtime{$url}," -->\n"; } print C2OUT $page; close C2OUT; } $page; } sub cache_page_later { &cache_page ($_[0], $_[1], 1); } # --------------------------------------------------------------------------- sub write_as_story { local ($_); my ($is_front, $url, $baseurl, $page, $headline) = @_; my $sitename = $name{$baseurl}; if (defined $story_postproc{$baseurl}) { my $bookmark_char = $main::bookmark_char; # convenience for PostProc $_ = $page; if (!eval $story_postproc{$baseurl}."; 1;") { &sitewarn("StoryPostProc failed: $@"); # and keep the original $page } else { $page = $_; } } my $outtext = ''; my $one_page_anchor; if (&writing_html) { $one_page_anchor = $url; $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g; $outtext .= "<hr><i>$sitename: $url</i><br><a name=\"$one_page_anchor\">\n" . "[<a href=\"__SITESCOOPER_STORY_" . ($main::current_story_index-1)."\"><<</a>]" . "[<a href=\"__SITESCOOPER_STORY_" . ($main::current_story_index+1)."\">>></a>]<br>" . $page; } else { $outtext .= "------------\n$sitename: $url\n\n"; if (&writing_doc) { if (defined $headline) { &verbose ("(Headline: $headline)"); $outtext .= "$main::bookmark_char $headline\n"; } else { # use the first line in the story instead $outtext .= "$main::bookmark_char "; } } foreach $_ (split (/\n/, $page)) { if (&writing_text) { # wrap each line after 70 columns while (s/^(.{70}\S*)\s+//) { $outtext .= $1."\n"; } } $outtext .= $_."\n"; } $outtext .= "\n\n\n"; } if ($main::fileperpage) { if ($is_front) { # this is the front page, just append it to the index file $main::output_file .= $outtext; } else { my ($fname, $relative) = &get_page_tmpfile ($url); open (PAGEFILE, ">> $fname") or die "failed to write to $fname\n"; if (&writing_html) { print PAGEFILE "<html><head></head><body>", $outtext, "</body></html>"; # ensure we have links to the stories from the index page! if ($main::output_file !~ /href=\"?$relative/) { if (!defined $headline) { $headline = '(no headline found)'; } $main::output_file .= "<a href=$relative>$headline</a><br>\n"; } $main::output_links_snarfed{$relative} = 1; $main::output_story_urls[$main::current_story_index] = $relative; } else { print PAGEFILE $outtext; } close PAGEFILE; } } else { $main::output_file .= $outtext; $main::output_links_snarfed{'#'.$one_page_anchor} = 1; $main::output_story_urls[$main::current_story_index] = '#'.$one_page_anchor; } $main::current_story_index++; $main::output_links_snarfed{$url} = 1; $file_size += length($outtext); &dbg ("story written, ". ($file_size/1024)." Kb, limit is ". $main::filesizelimit." Kb"); if ($file_size/1024 >= $main::filesizelimit) { $hit_file_size_limit = 1; } $stories_found++; } # --------------------------------------------------------------------------- sub get_page_tmpfile { local ($_); my $url = shift; if (!defined $page_to_tmpfile{$url}) { $_ = $url; s/[^-_A-Za-z0-9]+/_/g; $page_to_tmpfile{$url} = $tmpfile.$slash.$_; while (-r $page_to_tmpfile{$url}) { $page_to_tmpfile{$url} .= (time % 10); } $page_to_tmpfile{$url} .= '.html'; } &main::dbg ("page file for $url: $page_to_tmpfile{$url}"); $page_to_tmpfile{$url} =~ /[\\\/\:]([-_\.A-Za-z0-9]+)$/; ($page_to_tmpfile{$url}, $1); } sub clear_page_tmpfiles { %page_to_tmpfile = (); } # --------------------------------------------------------------------------- sub warn_log { my $msg = join ('', @_); chomp $msg; &log ("Warning: ", $msg); print STDERR @_; } sub die_log { my $msg = join ('', @_); chomp $msg; &log ("Fatal: ", $msg); print STDERR @_; &cleanexit(2); } sub log { if (defined fileno LOGFILE) { print LOGFILE @_, "\n"; } } sub journal { if (defined fileno JOURNAL) { my $tag = shift; my $lines = join("", @_); $lines =~ s/^/$tag:\t/gm; print JOURNAL $lines, "\n"; } } sub dbg { if ($main::debug != 0) { my $msg = "debug: ".join ('', @_); chomp $msg; &log ($msg); print STDERR "debug: ",@_,"\n"; } } sub sitewarn { &sitewarn_file_line ($sitewarn_current_site_line, @_); } sub sitewarn_file_line { my $fname = shift; $fname =~ s,^.*[\/\\:]([^\/\\:]+?):\d+?$,$1,o; warn "Site \"$fname\": ".join('', @_)."\n"; } sub verbose { if ($main::verbose) { my $msg = join ('', @_); chomp $msg; &log ($msg); print STDERR @_,"\n"; } } sub cleanexit { $SIG{__WARN__} = ''; $SIG{__DIE__} = ''; exit @_; } sub AbsoluteURL { local ($baseurl, $_) = @_; s/^"//; s/"$//; # trim quotes if necessary s/^%22//; s/%22.*?$//; # trim escaped quotes (!!) s/#.*$//g; # we can't get bits of docs (yet) TODO if (/^[^\/]+:/) { if (!/^(http|file):/) { # non-HTTP urls get ignored; don't get URI::URL involved, it'll crash return $_; } } use URI::URL; my $url = new URI::URL ($_, $baseurl); $url->abs->as_string; } sub AddHostToURL { # a simpler form of AbsoluteURL, used for StoryURL lines. # this is necessary because the real thing will escape metacharacters # which screws up regexp patterns. local ($baseurl, $_) = @_; s/^"//; s/"$//; # trim quotes if necessary s/#.*$//g; # we can't get bits of docs (yet) $_ = &expand_url_magic ($_); # allow [[MM]] etc. keywords in these patterns if (m,^[^/]+://,) { # do nothing, it's fully-qualified } elsif (m,^/,) { $baseurl =~ m,^([^/]+://[^/]+)/, and ($_ = $1.$_); } $_; } sub mm_to_monthname { my @months = qw(x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); $months[$_[0]]; } sub get_date { my $time = shift; my ($x, $mday, $mon, $year); ($x,$x,$x,$mday,$mon,$year,$x,$x,$x) = localtime(defined $time ? $time : time); $mon++; $year += 1900; ($mday, $mon, $year, &mm_to_monthname($mon)); } sub time2datestr { my $time = shift; my ($dd, $mm, $year, $mon) = &get_date ($time); "$mon $dd $year"; } sub match_url { my $url = shift; my $pat = &expand_url_magic (shift); ($url =~ m#^${pat}$#); } sub expand_url_magic { local ($_); my $url = shift; my ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr); if (!defined $match_url_yyyy) { ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr) = &get_date; $match_url_yy = $match_url_yyyy; $match_url_yy =~ s/^\d\d//; # trim century $match_url_mm = "0$match_url_mm" unless ($match_url_mm =~ /^..$/); $match_url_dd = "0$match_url_dd" unless ($match_url_dd =~ /^..$/); } $url =~ s/\[\[YYYY\]\]/${match_url_yyyy}/g; $url =~ s/\[\[YY\]\]/${match_url_yy}/g; $url =~ s{\[\[MM([\+\-]\d+|)\]\]}{ &offset_month($match_url_mm, $1); }ge; $url =~ s{\[\[Mon([\+\-]\d+|)\]\]}{ &mm_to_monthname (&offset_month($match_url_mm, $1)); }ge; $url =~ s{\[\[mon([\+\-]\d+|)\]\]}{ $_ = &mm_to_monthname (&offset_month($match_url_mm, $1)); tr/A-Z/a-z/; $_; }ge; $url =~ s{\[\[MON([\+\-]\d+|)\]\]}{ $_ = &mm_to_monthname (&offset_month($match_url_mm, $1)); tr/a-z/A-Z/; $_; }ge; $url =~ s/\[\[DD\]\]/${match_url_dd}/g; $url; } sub offset_month { my $mm = shift; my $offset = shift; if ($offset ne '') { $mm += $offset; } if ($mm < 1 || $mm > 12) { $mm = ((($mm-1)+12) % 12)+1; } $mm = "0$mm" unless ($mm =~ /^..$/); $mm; } sub writing_doc { ($main::outstyle == $main::OUT_DOC); } sub writing_html { ($main::outstyle == $main::OUT_HTML); } sub writing_text { ($main::outstyle == $main::OUT_TEXT); } #=========================================================================== package Portability; sub MyOS { if (defined ($Portability::MY_OS)) { return $Portability::MY_OS; } # FIGURE OUT THE OS WE'RE RUNNING UNDER # Some systems support the $^O variable. If not available then require() # the Config library. [nicked from CGI.pm -- jmason] my $os; unless ($os) { unless ($os = $^O) { require Config; $os = $Config::Config{'osname'}; } } if ($os=~/win/i) { $os = 'Win32'; } elsif ($os=~/vms/i) { $os = 'VMS'; } elsif ($os=~/mac/i) { $os = 'Mac'; } elsif ($os=~/os2/i) { $os = 'OS2'; } else { $os = 'UNIX'; } $Portability::MY_OS = $os; } 1; #--------------------------------------------------------------------------- package ScoopHTTP::UserAgent; use LWP::UserAgent; BEGIN { @ISA = qw(LWP::UserAgent); @ScoopHTTP::UserAgent::PasswdMask = unpack ("c*", "Ish0ulDReallY#BeDoING|05th1S>wiTh". "5omEThInG+STr0NgeR1kNoW}iKNOw!~"); } sub new { my($class) = @_; my $self = new LWP::UserAgent; $self = bless $self, $class; $self; } sub get_basic_credentials { my ($self, $realm, $uri, $proxy) = @_; if (defined $site_logins{$realm} && defined $site_passes{$realm}) { &main::verbose ("(using already-set password for $uri $realm)"); } else { warn ("Need a password to access $uri $realm.\n"); if ($main::cgimode || !-t) { return undef; } print STDERR ("Username: "); my $user = <STDIN>; chop $user; print STDERR ("Password: "); (&Portability::MyOS eq 'UNIX') and system ("stty -echo"); my $pass = <STDIN>; chop $pass; (&Portability::MyOS eq 'UNIX') and system ("stty echo"); print STDERR "\n"; $site_logins{$realm} = $user; $site_passes{$realm} = $pass; } ($site_logins{$realm}, $site_passes{$realm}); } sub load_logins { if (defined %site_logins) { return %site_logins; } %site_logins = (); %site_passes = (); open (IN, '< '.$main::user_tmpdir.$main::slash.'site_logins') or return undef; #$site_logins{'tst'} = $site_passes{'tst'} = "jmason"; &save_logins; while (<IN>) { s/[\r\n]+$//g; my ($ver, $user, $pass, $realm) = split (/###/); if (defined $realm && $ver+0 == 0) { $site_logins{$realm} = $user; my @mask = @ScoopHTTP::UserAgent::PasswdMask; my @input = split (' ', $pass); my $pass_open = ''; my $i = 0; foreach $_ (@input) { my $ch = (($_ ^ $mask[$i++ % $#mask]) ^ 0xaa); last if ($ch == 0); $pass_open .= sprintf ("%c", $ch); } $site_passes{$realm} = $pass_open; } } close IN; #print "[", $site_logins{'tst'}, "][", $site_passes{'tst'}, "]\n"; exit; } sub save_logins { if (!defined %site_logins) { return; } my $towrite = ''; foreach $realm (sort keys %site_logins) { my @mask = @ScoopHTTP::UserAgent::PasswdMask; my @input = (unpack ("c*", $site_passes{$realm})); my $pass_disguised = ''; my $i = 0; foreach $_ (@input) { $pass_disguised .= (($_ ^ 0xaa) ^ $mask[$i++ % $#mask]) . " "; } while ($i < int(($#input / 16) + 1) * 16) { $pass_disguised .= ((0 ^ 0xaa) ^ $mask[$i++ % $#mask]) . " "; } chop $pass_disguised; $towrite .= "0###". $site_logins{$realm}. "###". $pass_disguised. "###". $realm. "\n"; } # again, all at once to minimise contention open (OUT, '> '.$main::user_tmpdir.$main::slash.'site_logins') or (warn ("failed to write to site_logins file!\n"), return); print OUT $towrite; close OUT or warn ("failed to write to site_logins file!\n"); } 1; #--------------------------------------------------------------------------- # #CGI package ScoopCGI; #CGI #CGI $cgi_cookie = undef; #CGI #CGI sub set_cookie { #CGI my ($userid) = @_; #CGI $cgi_cookie = $main::cgi->cookie(-name=>'sitescooper', -value=>"$userid"); #CGI print $main::cgi->header(-cookie=>$cgi_cookie); #CGI } #CGI #CGI sub get_cookie { #CGI my $cookie = $main::cgi->cookie('sitescooper'); #CGI return unless defined ($cookie); #CGI #CGI my ($uid, $x) = split ('#', $cookie); #CGI ($uid =~ /(\d+)/) and ($main::userid = $1); #CGI } #CGI #CGI sub print_input_form { #CGI # REVISIT #CGI } #CGI #CGI sub print_results_links { #CGI # REVISIT #CGI } #CGI #CGI sub get_prc_file { #CGI # REVISIT #CGI } 1; # TODO: # # incorporate support for mjd's diff implementation # bundle LWP modules in sitescooper.tar.gz # allow more flexibility for filenames: by day-of-week, with date at end # finish CGI support # CGI: finish cookie userid support -- passwords # #--------------------------------------------------------------------------- # vim:sw=2:tw=74: