#!/usr/bin/perl -w
#
# sitescooper - download news from web sites and convert it automatically
#	 into one of several formats suitable for viewing on a Palm
#	 handheld.
#
# Skip down to read the POD documentation.
#
# To set up, search for "CUSTOMISE" -- note UK/Irish spelling. ;)
# Change the setting appropriately and uncomment it, if required.
# Then move the required sites from the "sites_off" directory into the
# "sites" directory, and those will be downloaded automatically next
# time you run the script.

$main::VERSION = "1.8";

$CONFIG = '
# NOTE: on Windows, you will need to use 2 backslashes in any paths in
# this built-in configuration file, to avoid Perl interpreting them,
# like so: C:\\TMP


#######################################################################

# OPTIONAL SITE-DEPENDENT CONFIGURATION

# NOTE: If you will be converting sites into Pilot-readable format, you
# may need to specify this: the directory under your PalmPilot Desktop
# dir where installable PRC files should go.
#
# On UNIX platforms using pilot-xfer, you should set up a directory where
# installed files are to go before you run "pilot-xfer -i".
#
# On a Win32 machine with only one Pilot user, this is determined
# automatically from the registry, so you will not need to set it.
# Otherwise, on Win32 platforms this is generally of the format
# {pilotdir}/{username}/Install, where {pilotdir} is the PalmPilot Desktop
# directory, and {username} is the abbreviation of the Pilot user name.
#
# MacOS users: you do not need to customise this; a default will be used
# (although nothing will be written there). You need to run the conversion
# command yourself afterwards :(

# PilotInstallDir: $HOME/pilot/install		# CUSTOMISE

#######################################################################

# Sites directory, where the site description files are stored.
# By default, a directory called "sites" under your current directory,
# or under your $HOME on UNIX, is used if it exists.

# SitesDir: $HOME/lib/sites			# CUSTOMISE

#######################################################################

# Temporary directory to use for sitescooper. A subdirectory called
# sitescooper_{uid} will be created, where {uid} is your user id (or
# 0 on Win32 platforms). On UNIX platforms, this defaults to a hidden
# directory under your home dir, for privacy.

# TmpDir: /tmp					# CUSTOMISE

#######################################################################

# Specify the HTTP proxy server in use at your site, if applicable.

# ProxyHost: proxy.clubi.ie			# CUSTOMISE
# ProxyPort: 80					# CUSTOMISE

#######################################################################

# Diff will be searched for on the path if this is not specified here.

# Diff: C:\\path\\to\\diff.exe			# CUSTOMISE

#######################################################################

# The MakeDoc tool will be searched for on the path if it is
# not specified here. Default values are "makedoc" on UNIX platforms,
# or "makedocw.exe" on Windows.

# MakeDoc: makedocw.exe				# CUSTOMISE

# The iSilo conversion tool will be searched for on the path if it is
# not specified here. Default values are "iSilo386" for UNIX platforms,
# or "iSiloW32.exe" on Windows.

# iSilo: iSiloW32.exe				# CUSTOMISE

#######################################################################

# Where you want the text-format output to be saved. If this is left
# commented out, output is saved under the sitescooper temporary directory.
# Note: MakeDocW on Windows does not like reading text from a directory
# with spaces in its path, such as a directory under Program Files; in
# that case, changing this parameter may help.

# TextSaveDir: C:\\where\\I\\want\\News		# CUSTOMISE

#######################################################################

# If you want to share a cache between multiple users or multiple
# configurations of sitescooper, uncomment this and set it to the
# directory to use for the shared cache.

# SharedCacheDir: /home/jm/lib/scoop_cache	# CUSTOMISE

#######################################################################

# How long should cached files be kept before expiring? Specified in
# days, and fractions are OK.

# ExpireCacheAfter: 7.0				# CUSTOMISE
';

#---------------------------------------------------------------------------

=head1 NAME

sitescooper - download news from web sites and convert it automatically
into one of several formats suitable for viewing on a Palm handheld.

=head1 SYNOPSIS

sitescooper [options] [-site sitename]

sitescooper [options] [-levels n] [-storyurl regexp] url [...]

Options: [-debug] [-refresh] [-config file] [-limit numkbytes]
[-install dir] [-dump] [-dumpprc] [-nowrite] [-nodates]
[-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file]
[-keep-tmps]
[-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader
| -pipe fmt command]

=head1 DESCRIPTION

This script, in conjunction with its configuration file and its set of
B<site> files, will download news stories from several top news sites into
text format and/or onto your Palm handheld (with the aid of the
B<makedoc>/B<MakeDocW> or B<iSilo> utilities).

Alternatively URLs can be supplied on the command line, in which case
those URLs will be downloaded and converted using a reasonable set of
default settings.

Both HTTP URLs and local files (via the C<file:///> protocol) are supported.

Multiple types of sites are supported:

=over 4

1-level sites, where the text to be converted is all present on one page
(such as Slashdot, Linux Weekly News, BluesNews, NTKnow, Ars Technica);

2-level sites, where the text to be converted is linked to from a Table
of Contents page (such as Wired News, BBC News, and I, Cringely);

3-level sites, where the text to be converted is linked to from a Table
of Contents page, which in turn is linked to from a list of issues
page (such as PalmPower).

=back

In addition, sites that post news items on one big page, such as
Slashdot, Ars Technica, and BluesNews, are supported using diff.

Note that, at present, the URLs-on-the-command-line invocation format
does not support 2- or 3-level sites.

The script is portable to most UNIX variants that support perl, as well
as the Win32 platform (tested with ActivePerl 5.00502 build 509).

Currently the configuration is stored as a string inside the script
itself, but an alternative configuration file can be specified with the
B<-config> switch.

The sites downloaded will be the ones listed in the site files you keep in
your F<sites> directory.
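
A site file is a plain-text description of how to scoop one site, made up
of C<Directive: value> lines. As a minimal sketch (the URL and start/end
markers here are hypothetical, and only a few of the recognised directives
are shown), a 2-level site might look like:

        URL: http://www.example.com/news/
        Name: Example News
        Levels: 2
        ContentsStart: <!-- begin headlines -->
        ContentsEnd: <!-- end headlines -->
        StoryURL: http://www.example.com/news/.*\.html
        StoryStart: <!-- begin story -->
        StoryEnd: <!-- end story -->

The C<StoryURL> value is a regular expression; story links that do not
match it are ignored.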

sitescooper maintains a cache in its temporary directory; by default,
files are kept in this cache for a week at most (configurable with
B<ExpireCacheAfter>). The same applies to the text output directory
(set with B<TextSaveDir> in the built-in configuration).

If a password is required for the site, and the current sitescooper session
is interactive, the user will be prompted for the username and password.
This authentication token will be saved for later use.  This way a site
that requires login can be set up as a .site -- just log in once, and your
password is saved for future non-interactive runs.

Note however that the encryption used to hide the password in the
sitescooper configuration is pretty transparent; I recommend that, rather
than using your own username and password to log in to passworded sites,
you use a dedicated sitescooper account instead.

=head1 OPTIONS

=over 4

=item -refresh

Refresh all links -- ignore the F<already_seen> file, do not diff pages,
and always fetch links, even if they are available in the cache.

=item -config file

Read the configuration from B<file> instead of using the built-in one.

=item -limit numkbytes

Set the limit for output file size to B<numkbytes> kilobytes, instead of
the default 200K.

=item -install dir

The directory to save PRC files to once they've been converted, in order
to have them installed to your Palm handheld.

=item -site sitename

Limit the run to the site named in the B<sitename> argument.  Normally all
available sites will be downloaded. To limit the run to 2 or more sites,
provide multiple B<-site> arguments like so:

	-site ntk.site -site tbtf.site

=item -levels n

When specifying a URL on the command-line, this indicates how many levels
a site has. Not needed when using .site files.

=item -storyurl regexp

When specifying a URL on the command-line, this indicates the regular
expression which links to stories should conform to. Not needed when using
.site files.

=item -doc

Convert the page(s) downloaded into DOC format, with all the articles
listed in full, one after the other.

=item -text

Convert the page(s) downloaded into plain text format, with all the
articles listed in full, one after the other.

=item -html

Convert the page(s) downloaded into HTML format, on one big page, with
a table of contents (taken from the site if possible), followed by all
the articles one after another.

=item -mhtml

Convert the page(s) downloaded into HTML format, but retain the
multiple-page format. This will create the output in a directory
called B<site_name.pages>; in conjunction with the B<-dump> argument,
it will output the path of this directory on standard output before
exiting.

=item -isilo

Convert the page(s) downloaded into iSilo format (see
http://www.isilo.com/ ), on one big page.  This is the default.  The
page(s) will be displayed with a table of contents (taken from the site if
possible), followed by all the articles one after another.

=item -misilo

Convert the page(s) downloaded into iSilo format (see
http://www.isilo.com/ ), with one iSilo document per site, with each story
on a separate page.  The iSilo document will have a table-of-contents
page, taken from the site if possible, with each article on a separate
page.

=item -richreader

Convert the page(s) downloaded into RichReader format using HTML2Doc.exe
(see http://users.erols.com/arenakm/palm/RichReader.html ).  The page(s)
will be displayed with a table of contents (taken from the site if
possible), followed by all the articles one after another.

=item -pipe fmt command

Convert the page(s) downloaded into an arbitrary format, using the command
provided. Sitescooper will still rewrite the page(s) according to the
B<fmt> argument, which should be one of:

=over 4

=item text

Plain text format.

=item html

HTML in one big page.

=item mhtml

HTML in multiple pages.

=back

The command argument can contain C<__SCOOPFILE__>, which will be replaced
with the filename of the file containing the rewritten pages in the above
format, C<__SYNCFILE__>, which will be replaced with a suitable filename
in the Palm synchronization folder, and C<__TITLE__>, which will be
replaced by the title of the file (generally a string containing the date
and site name).

Note that for the B<-mhtml> switch, C<__SCOOPFILE__> will be replaced
with the name of the file containing the table-of-contents page. It's up
to the conversion utility to follow the href links to the other files
in that directory.
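
For example, a hypothetical invocation piping the one-page HTML rendition
through your own converter (here called C<myconv>, not a real tool) might
look like:

        sitescooper -pipe html "myconv __SCOOPFILE__ -o __SYNCFILE__ -t '__TITLE__'"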

=item -dump

Output the page(s) downloaded directly to stdout in text or HTML format,
instead of writing them to files and converting each one. This option
implies B<-text>; to dump HTML, use B<-dump -html>.

=item -dumpprc

Output the page(s) downloaded directly to stdout, in converted format as a
PRC file, suitable for installation to a Palm handheld.

=item -nodates

Do not put the date in the installable file's filename. This allows you to
automatically overwrite old files with new ones when you HotSync.

=item -nowrite

Test mode -- do not write to the cache or already_seen file; instead, write
what would normally be written to a directory called new_cache and a
new_already_seen file. This is very handy when writing a new site file.

=item -debug

Enable debugging output. This output is in addition to the usual progress
messages.

=item -quiet

Process sites quietly, without printing the usual progress messages to
STDERR. Warnings about incorrect site files and system errors will still
be output, however.

=item -admin cmd

Perform an administrative command. This is intended to ease the task of
writing scripts which use sitescooper output.  The following admin
commands are available:

=over 4

=item dump-sites

List the sites which would be scooped on a scooping run, and their URLs.
Instead of scooping any sites, sitescooper will exit after performing this
task.  The format is one site per line, with the site file name first, a
tab, the site's URL, a tab, the site name, a tab, and the output filename
that would be generated without path or extension. For example:

S<foobar.site	http://www.foobar.com/	Foo Bar	1999_01_01_Foo_Bar>

=item journal

Write a journal with dumps of the documents as they pass through the
formatting and stripping steps of the scooping process. This is
written to a file called B<journal> in the sitescooper temporary
directory.

=back

=item -nolinkrewrite

Do not rewrite links on scooped documents -- leave them exactly as they
are.

=item -stdout-to file

Redirect the output of sitescooper into the named file. This is needed
on Windows NT and 95, where perl does not seem to support the E<gt>
operator.

=item -keep-tmps

Keep temporary files after conversion. Normally the .txt or .html
rendition of a site is deleted after conversion; this option keeps
it around.

=back

=head1 INSTALLATION

To install, edit the script and change the #! line. You may also need to
(a) change the Pilot install dir if you plan to use the pilot installation
functionality, and (b) edit the other parameters marked with CUSTOMISE in
case they need to be customised for your site. They should be set to
acceptable defaults (unless I forgot to comment out the proxy server lines
I use ;).

=head1 EXAMPLES

	sitescooper.pl http://www.ntk.net/

To snarf the ever-cutting NTKnow newsletter ("nasty, British and short").
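
        sitescooper.pl -misilo -site ntk.site

To scoop the same site from its F<.site> file (assuming F<ntk.site> is
present in your sites directory), producing one multi-page iSilo document.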

=head1 ENVIRONMENT

B<sitescooper> makes use of the C<$http_proxy> environment variable, if it
is set.
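
For example, under a Bourne-compatible shell (with a hypothetical proxy
host), you could run:

        http_proxy=http://proxy.example.com:8080/ sitescooper.pl http://www.ntk.net/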

=head1 AUTHOR

Justin Mason E<lt>justin_mason /at/ bigfoot.comE<gt>

=head1 COPYRIGHT

Some of the post-processing and HTML cleanup code include ideas and code
shamelessly stolen from http://pilot.screwdriver.net/ , Christopher
Heschong's <chris at screwdriver.net> webpage-to-pilot conversion tool,
which I discovered after writing a fully-working version of this script!
Looks like I reinvented the wheel again on this one ;)

Eh, anyway, the remainder of the code is copyright Justin Mason 1998-1999,
and is free software and as such is redistributable and modifiable under
the same terms as Perl itself. Justin can be reached at <justin_mason /at/
bigfoot.com>.

=head1 SCRIPT CATEGORIES

The CPAN script category for this script is C<Web>. See
http://www.cpan.org/scripts/ .

=head1 PREREQUISITES

	C<File::Find>
	C<File::Path>
	C<URI::URL>
	C<LWP::UserAgent>
	C<HTTP::Request::Common>
	C<HTTP::Date>

All these can be picked up from CPAN at http://www.cpan.org/ .

=head1 COREQUISITES

C<Win32::TieRegistry>, if running on a Win32 platform, to find the Pilot
Desktop software's installation directory.

=head1 README

Sitescooper downloads news stories from the web and converts them to Palm
handheld iSilo, DOC or text format for later reading on-the-move.  Site
files and full documentation can be found at
http://sitescooper.tsx.org/ .

=cut

#---------------------------------------------------------------------------

sub usage { die <<__ENDOFUSAGE;

Sitescooper - downloads news stories from the web and converts them to
	Palm handheld iSilo, DOC or text format for later
	reading on-the-move.

sitescooper [options] [-site sitename]

sitescooper [options] [-levels n] [-storyurl regexp] url [...]

Options: [-debug] [-refresh] [-config file] [-limit numkbytes]
	[-install dir] [-dump] [-dumpprc] [-nowrite] [-nodates]
	[-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file]
	[-keep-tmps]
	[-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader
	| -pipe fmt command]

Both file:/// and http:// URLs are supported.

Version: $main::VERSION
__ENDOFUSAGE
					    }

#---------------------------------------------------------------------------

# use Carp;
# use strict;	# ah shaggit, life's too short for strict
use File::Find;
use File::Path;

use LWP::UserAgent;
use URI::URL;
use HTTP::Date;
use HTTP::Request::Common;

#CGI use CGI;

if (&Portability::MyOS eq 'Win32') {
  eval 'use Win32::TieRegistry( Delimiter=>"/", ArrayValues=>0 );';
}

$SIG{__WARN__} = 'warn_log';
$SIG{__DIE__} = 'die_log';

$OUT_TEXT = 0;
$OUT_DOC = 1;
$OUT_HTML = 2;

$main::home_url = "http://sitescooper.tsx.org";
$main::refresh = 0;
#$main::just_caching = 0;
$main::cached_front_page_lifetime = 60;	# in minutes
$main::dump = 0;
$main::dumpprc = 0;
$main::debug = 0;
$main::debugdiffs = 0;			# set to 1 to break after diffing
$main::verbose = 1;
$main::nowrite = 0;
$main::nodates = 0;
$main::bookmark_char = "\x8D";		# yes, same as Chris' one, cheers!
undef $main::pilotinstdir;
$main::use_convert_tool = 0;
$main::cgimode = 0;
$main::outstyle = $OUT_HTML;
$main::outputfilter = 'isilo';
$main::fileperpage = 0;
$main::nolinkrewrite = 0;
$main::filesizelimit = 200;		# limit of Kb per file (uncompressed)
$main::use_only_cache = 0;
#CGI $main::cgi = undef;
$main::admincmd = undef;
@main::sites_wanted = ();
@main::cmdline_urls = ();
%main::caches_to_rename = ();
$main::keep_tmps = 0;

$main::argv_levels = undef;
$main::argv_storyurl = undef;
$main::expiry_days = 7.0;

$main::useragent = new ScoopHTTP::UserAgent;
$main::useragent->env_proxy;
$main::useragent->agent ("sitescooper/$main::VERSION ($main::home_url) ".
		$main::useragent->agent);

&clear_page_tmpfiles;

# --------------------------------------------------------------------------

if (defined $ENV{'REQUEST_METHOD'}) {
  # we're running from a CGI script, use CGI mode
  #$main::cgimode = 1;
  #$main::cgi = new CGI;
}

# This is the placeholder for development debug flags.
# Add debugging stuff here, tagged with J M D (without the spaces ;).



# --------------------------------------------------------------------------

if ($main::cgimode == 0) {
  while ($#ARGV >= 0) {
    $_ = shift;

    if (/^-debug$/) {
      $main::debug = 1;

    } elsif (/^-quiet$/) {
      $main::verbose = 0;

    } elsif (/^-refresh/) {
      $main::cached_front_page_lifetime = 0;
      $main::refresh = 1;

    #} elsif (/^-cache/) {
      #$main::just_caching = 1;	# used for future parallelism

    } elsif (/^-dump/) {
      $main::dump = 1;
      $main::outstyle = $OUT_TEXT;

    } elsif (/^-dumpprc/) {
      $main::dumpprc = 1;

    } elsif (/^-doc/) {
      $main::outstyle = $OUT_DOC;
      $main::fileperpage = 0;
      $main::outputfilter = 'makedoc';

    } elsif (/^-isilo/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = 'isilo';

    } elsif (/^-misilo/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 1;
      $main::outputfilter = 'isilo';

    } elsif (/^-richreader/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = 'richreader';

    } elsif (/^-text/) {
      $main::outstyle = $OUT_TEXT;
      $main::fileperpage = 0;
      $main::outputfilter = '__cat__';

    } elsif (/^-html/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = '__cat__';

    } elsif (/^-mhtml/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 1;

    } elsif (/^-pipe/) {
      my $fmt = shift;
      my $cmd = shift;

      if ($fmt eq 'text') {
	$main::outstyle = $OUT_TEXT;
	$main::fileperpage = 0;

      } elsif ($fmt eq 'html') {
	$main::outstyle = $OUT_HTML;
	$main::fileperpage = 0;

      } elsif ($fmt eq 'mhtml') {
	$main::outstyle = $OUT_HTML;
	$main::fileperpage = 1;
      } else {
	&usage;
      }
      $main::outputfilter = 'cmd: '.$cmd;

    } elsif (/^-admin$/) {
      $main::admincmd = shift;

    } elsif (/^-nolinkrewrite/) {
      $main::nolinkrewrite = 1;
    } elsif (/^-fromcache/) {
      $main::use_only_cache = 1;
    } elsif (/^-limit/) {
      $main::filesizelimit = shift(@ARGV)+0;
    } elsif (/^-nodates/) {
      $main::nodates = 1;
    } elsif (/^-nowrite/) {
      $main::nowrite = 1;
    } elsif (/^-config/) {
      $config = shift;
    } elsif (/^-install/) {
      $pilotinstdir = shift;
    } elsif (/^-site/) {
      push (@sites_wanted, shift);
    } elsif (/^-levels/) {
      $argv_levels = shift()+0;
    } elsif (/^-storyurl/) {
      $argv_storyurl = shift;
    } elsif (/^-keep-tmps/) {
      $main::keep_tmps = 1;
    } elsif (/^-stdout-to/) {
      $_ = shift; close (STDOUT);
      open (STDOUT, ">> ".$_) or die "failed to redirect STDOUT to $_\n";

    } elsif (/^-/) {
      &usage;
    } else {
      unshift @ARGV, $_; last;
    }
  }
  @main::cmdline_urls = @ARGV;
  $main::userid = $<;

} else {
  # load some things from CGI parameters
  #CGI@main::cmdline_urls = ($main::cgi->param ('url'));
  #CGI$main::argv_levels = $main::cgi->param ('levels');
  #CGI$main::argv_storyurl = $main::cgi->param ('storyurl');

  #CGI@main::sites_wanted = $main::cgi->param ('sites');

  #CGI$main::debug = $main::cgi->param ('debug');
  #CGI$main::outstyle = $main::cgi->param ('outstyle');
  #CGI$main::nowrite = $main::cgi->param ('nowrite');
  #CGI$main::refresh = $main::cgi->param ('refresh');
  #CGI$main::userid = $main::cgi->param ('userid');
  #CGI&ScoopCGI::get_cookie;
  #CGI $main::password = $main::cgi->param ('password');
  # REVISIT -- use a cookie to store userid and password

  #CGI$main::pilotinstdir = undef;
}

@conflines = ();
if (defined $config) {
  open (IN, "< $config") || die "cannot read $config\n";
  @conf = (<IN>); close IN;
  for ($i=0; $i<=$#conf; $i++) { $conflines[$i] = "$config:".($i+1); }
} else {
  @conf = split(/\n/, $CONFIG);
  for ($i=0; $i<=$#conf; $i++) { $conflines[$i] = "(built-in):".($i+1); }
}

if ($main::debugdiffs) {
  &main::dbg ("debugging, will exit after diff");
}

# --------------------------------------------------------------------------

# Andrew Fletcher <fletch@computer.org>:
# A relative path on Mac seems to need a ":" before it. I've called
# this $colon.
$colon = '';
$slash = '/';

if (&Portability::MyOS eq 'Win32') { $slash = '\\'; }
if (&Portability::MyOS eq 'Mac') { $slash = ':'; $colon = ':'; }

$outdir = '';
%site_format = ();
%links_start = %links_end = ();
%links_limit_to = %story_limit_to = ();
%links_print = ();
%story_skip = %links_skip = ();
%story_diff = %links_diff = ();
%links_follow_links = %story_follow_links = ();
%story_lifetime = ();
%story_postproc = ();
%url_postproc = ();
%cacheable = ();	# 0 = static, 1 = dynamic, undef = use heuristics
%printable_sub = ();
%head_pat = ();
%levels = ();
%use_table_smarts = ();
%extra_urls = ();
@sites = ();
$url = '';
$sect = '';
$curkey = '';
$main::cached_front_page_lifetime /= (24*60);	# convert to days
%url_title = ();
$sharedcache = undef;

undef $tmpdir;
if (&Portability::MyOS eq 'UNIX') {
  $tmpdir = $ENV{'HOME'}."/.sitescooper";
}
$tmpdir ||= $ENV{'TMPDIR'}; $tmpdir ||= $ENV{'TEMP'};

$diff = 'diff';
if (&Portability::MyOS eq 'Win32') { $diff = "diff.exe"; }

$makedoc = 'makedoc';
if (&Portability::MyOS eq 'Win32') { $makedoc = "makedocw.exe"; }

$isilo = 'iSilo386'; $isiloargs = '-y'; $isilomultipageargs = '-d2';
if (&Portability::MyOS eq 'Win32')
	{ $isilo = "iSiloW32.exe"; $isiloargs = '-u -y'; }

$richreader = 'HTML2Doc'; $richargs = '';
if (&Portability::MyOS eq 'Win32')
	{ $richreader = "HTML2Doc.exe"; $richargs = '-i'; }
# Note that currently there is no HTML2Doc for UNIX platforms; it's
# supported here anyway for future-proofing.

$sitesdir = "sites";
if (!-d $sitesdir && &Portability::MyOS eq 'UNIX')
			{ $sitesdir = $ENV{'HOME'}."/sites"; }

# ---------------------------------------------------------------------------

sub set_got_intr_behaviour {
  $got_intr_behaviour = shift;
  $got_intr_flag = 0;
}

sub got_intr {
  my $signame = shift;
  (&Portability::MyOS eq 'UNIX') and system ("stty echo");

  if ($got_intr_behaviour eq 'exit') {
    die "got signal SIG$signame, exiting.\n";
  } else {
    # don't die here: set the flag so the site loop can skip this site
    warn "got signal SIG$signame, skipping site...\n";
    $got_intr_flag = 1;
  }
}

&set_got_intr_behaviour ('exit');
$SIG{'INT'} = \&got_intr;
$SIG{'TERM'} = \&got_intr;

# ---------------------------------------------------------------------------

if (!defined $pilotinstdir && !$main::cgimode) {
  @main::possible_inst_dirs = ();
  my $dir;

  if (&Portability::MyOS eq 'Win32') {
    eval '
      sub get_instdir_wanted {
	return unless (/^install$/i && -d $File::Find::name);
	push (@main::possible_inst_dirs, $File::Find::name);
      }

      my $key = "HKEY_CURRENT_USER/Software/U.S. Robotics".
		    "/Pilot Desktop/Core//Path";
      if ($dir = $Registry->{$key}) {
	@main::possible_inst_dirs = ();
	find(\&get_instdir_wanted, $dir);
      }
    ';

  } elsif (defined $ENV{'HOME'}) {
    $dir = $ENV{'HOME'}."/pilot";
    if (-d "$dir/install") { @main::possible_inst_dirs = ("$dir/install"); }
    if (-d $dir) { @main::possible_inst_dirs = ($dir); }
  }

  if ($#main::possible_inst_dirs == 0) {
    $pilotinstdir = $main::possible_inst_dirs[0];

  } elsif ($#main::possible_inst_dirs > 0 && !&writing_text) {
    warn "Fatal: too many potential pilot PRC install directories, ".
	"please use '-install' argument.\n";
    foreach $dir (@main::possible_inst_dirs) {
      warn "Possible choice: $dir\n";
    }
    &cleanexit(1);
  }
}
&ReadSitesDir;

# ---------------------------------------------------------------------------

my $postproc = undef;
my $postproctype = undef;
my $proxyhost;
my $proxyport = 80;
my $confline;

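# Parse the configuration (built-in or -config) and any site files read
# by &ReadSitesDir, one line at a time, dispatching on the directive name.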
foreach $_ (@conf) {
  $confline = shift @conflines;
  s/#.*$//; s/^\s+//; s/\s+$//g; next if (/^$/);

  # process environment variable references: ${ENVVARNAME}
  # &main::dbg ("variable ref in site file: $1");
  s/\$\{(\S+?)\}/
  	defined($ENV{$1}) ? $ENV{$1} : "";
  /ge;

  if (defined $postproctype) {
    $postproc .= $_;
    # see if it's the end of the postproc statement scope
    $x = $postproc; 1 while ($x =~ s/\{[^\{\}]*\}//gs);		#{
    if ($x =~ /\}\s*$/) {
      if ($postproctype eq 'Story') {				#{
	$postproc =~ /^(.*)\}\s*$/; $story_postproc{$curkey} = $1;
	$postproc = undef;
	$postproctype = undef;
      }
      if ($postproctype eq 'URL') {			#{
	$postproc =~ /^(.*)\}\s*$/; $url_postproc{$curkey} = $1;
	$postproc = undef;
	$postproctype = undef;
      }
    }
    next;
  }

  s/^(\S+:)\s+/$1 /;		# easier to read this way ;)
  /^ProxyHost: (.*)$/ and ($proxyhost = $1), next;
  /^ProxyPort: (.*)$/ and ($proxyport = $1+0), next;
  /^TmpDir: (.*)$/ and ($tmpdir = $1), next;

  if (/^SitesDir: (.*)$/) {
    $sitesdir = $1;
    &ReadSitesDir;
    next;
  }

  /^MakeDoc: (.*)$/ and ($makedoc = $1), next;
  /^iSilo: (.*)$/ and ($isilo = $1), next;
  /^HTML2Doc: (.*)$/ and ($richreader = $1), next;
  /^Diff: (.*)$/ and ($diff = $1), next;
  /^TextSaveDir: (.*)$/ and ($outdir = $1), next;
  /^PilotInstallDir: (.*)$/ and ($pilotinstdir = $1), next;
  /^SharedCacheDir: (.*)$/ and ($sharedcache = $1), next;
  /^ExpireCacheAfter: (.*)$/ and ($expiry_days = $1+0.0), next;

  if (/^URL: (.*)$/) {
    &FinishConfigSection ($sect, $url);
    $url = &expand_url_magic ($1); $sect = '';

    if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
    if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }
    push (@sites, $url);
    &SetDefaultConfigForURL ($url);
    $site_defined_at{$url} = $confline;
    $curkey = $url;
    next;
  }

  if (!defined $curkey || $curkey eq '') {
    my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
    die "Configuration line invalid (outside URL scope?) in $line:\n  $_\n";
  }

  /^Name: (.*)$/ and ($name{$curkey} = $1), next;
  /^Description: (.*)$/ and ($desc{$curkey} = $1), next;
  /^Active: (.*)$/ and ($active{$curkey} = $1+0), next;
  /^Levels: (.*)$/ and ($levels{$curkey} = $1-2), next;
  /^AddURL: (.*)$/ and ($extra_urls{$curkey} .= ' '.&expand_url_magic($1)), next;
  /^UseTableSmarts: (.*)$/ and ($use_table_smarts{$curkey} = $1+0), next;

  /^IssueLinksStart: (.*)$/ and ($links_start{"1 $curkey"} = $1), next;
  /^IssueLinksEnd: (.*)$/     and ($links_end{"1 $curkey"} = $1), next;
  /^IssuePrint: (.*)$/      and ($links_print{"1 $curkey"} = $1+0), next;
  /^IssueCache?able: (.*)$/     and ($cacheable{"1 $curkey"} = $1+0), next;
  /^IssueDiff: (.*)$/        and ($links_diff{"1 $curkey"} = $1+0), next;
  /^IssueUseTableSmarts: (.*)$/ and ($use_table_smarts{"1 $curkey"} = $1+0), next;
  /^IssueFollowLinks: (.*)$/ and ($links_follow_links{"1 $curkey"} = $1+0), next;

  # Normally Issue-level stuff is the highest level, so this would seem to
  # be irrelevant as we never would have to decide whether a URL is the
  # issues page since it's provided in the site file. However the
  # IssueFollowLinks parameter provides a need for this.
  if (/^IssueURL: (.*)$/) {
    my $pat = $1;
    if (!defined ($links_limit_to{"1 $curkey"})) {
      $links_limit_to{"1 $curkey"} = "($pat)";
    } else {
      $links_limit_to{"1 $curkey"} =~ s/\)$/|$pat)/g;
    }
    next;
  }

  if (/^ContentsFormat: (.*)$/) {
    my $fmt = $1;
    if ($fmt eq 'rss') {
      # set up defaults for a Rich Site Summary site.
      # cf. http://my.netscape.com/publish/
      $site_format{$url} = 'rss';
      $links_start{"0 $curkey"} = '(<rdf:RDF|<rss version=|<scriptingNews)';
      $links_end{"0 $curkey"} = '(</rdf:RDF>|</rss>|</scriptingNews>)';
      $links_diff{"0 $curkey"} = 1;
      $levels{$url} = 0;

    } elsif ($fmt eq 'html') {
      # the default -- do nothing.

    } else {
      &sitewarn_file_line ($confline, "Unrecognised ContentsFormat: $_\n");
    }
    next;
  }

  /^ContentsStart: (.*)$/   and ($links_start{"0 $curkey"} = $1), next;
  /^ContentsEnd: (.*)$/       and ($links_end{"0 $curkey"} = $1), next;
  /^ContentsPrint: (.*)$/   and ($links_print{"0 $curkey"} = $1+0), next;
  /^ContentsCache?able: (.*)$/  and ($cacheable{"0 $curkey"} = $1+0), next;
  /^ContentsSkipURL: (.*)$/  and ($links_skip{"0 $curkey"} = $1), next;
  /^ContentsDiff: (.*)$/     and ($links_diff{"0 $curkey"} = $1+0), next;
  /^ContentsUseTableSmarts: (.*)$/ and ($use_table_smarts{"0 $curkey"} = $1+0), next;
  /^ContentsFollowLinks: (.*)$/	and ($links_follow_links{"0 $curkey"} = $1+0), next;

  if (/^ContentsURL: (.*)$/) {
    my $pat = &AddHostToURL ($curkey, $1);

    if (!defined ($links_limit_to{"0 $curkey"})) {
      $links_limit_to{"0 $curkey"} = "($pat)";
    } else {
      $links_limit_to{"0 $curkey"} =~ s/\)$/|$pat)/g;
    }
    next;
  }

  /^StoryStart: (.*)$/		and ($story_start{$curkey} = $1), next;
  /^StoryEnd: (.*)$/		and ($story_end{$curkey} = $1), next;
  /^StoryCache?able: (.*)$/	and ($cacheable{"s $curkey"} = $1+0), next;
  /^StoryDiff: (.*)$/		and ($story_diff{$curkey} = $1+0), next;
  /^StorySkipURL: (.*)$/	and ($story_skip{$curkey} = $1), next;
  /^StoryHeadline: (.*)$/	and ($head_pat{$curkey} = $1), next;
  /^StoryToPrintableSub: (.*)$/	and ($printable_sub{$curkey} = $1), next;
  /^StoryFollowLinks: (.*)$/	and ($story_follow_links{$curkey} = $1+0), next;
  /^StoryLifetime: (.*)$/	and ($story_lifetime{$curkey} = $1+0), next;

  if (/^StoryURL: (.*)$/) {
    my $pat = &AddHostToURL ($curkey, $1);

    if (!defined ($story_limit_to{$curkey})
      		|| $story_limit_to{$curkey} !~ /\)$/)
    {
      $story_limit_to{$curkey} = "($pat)";
    } else {
      $story_limit_to{$curkey} =~ s/\)$/|${pat})/g;
    }
    next;
  }

  if (/^(URL)Process: (.*)$/) {
    my $type = $1;
    my $val = $2;
    if ($val =~ s/^\{//) #}
    {
      $postproctype = $type;
      $postproc = $val;
    } else {
      if ($type eq 'URL') { $url_postproc{$curkey} = $val; }
    }
    next;
  }

  if (/^(Story)PostProcess: (.*)$/) {
    my $type = $1;
    my $val = $2;
    if ($val =~ s/^\{//) #}
    {
      $postproctype = $type;
      $postproc = $val;
    } else {
      if ($type eq 'Story') { $story_postproc{$curkey} = $val; }
    }
    next;
  }

  if (/^Section: (.*)$/) {
    &FinishConfigSection ($sect, $url);
    $sect = $1;

    if ($sect !~ m,^(http|file)://,i) {
      if ($sect !~ m,^/,i) {
	$sect = 'http://'.$sect;
      } else {
	$url =~ m,((http|file)://[^/]+)/,; $sect = $1.$sect;
      }
    }
    if ($sect =~ m,(http|file)://[^/]+$,) { $sect .= '/'; }
    $sect = &expand_url_magic ($sect);
    $sections{$url} .= "|||$sect";
    $levels{$sect} = $levels{$url};
    $active{$sect} = 1;
    $extra_urls{$sect} = '';
    $site_defined_at{$sect} = $confline;
    $curkey = $sect;
    next;
  }

  my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
  &sitewarn_file_line ($confline, "Unrecognised config line in $line:\n  $_\n");
}

if (defined $postproctype) {
  &sitewarn_file_line ($confline,
  	"Fell off end of ${postproctype}PostProcess statement!\n");
}

&FinishConfigSection ($sect, $url);
undef @conf;
undef @conflines;

if (!defined $sitesdir) {
  warn "Warning: can't find the 'sites' directory, please specify it!\n";
}

if (defined $proxyhost) {
  $main::useragent->proxy
  	(['http', 'ftp'], "http://$proxyhost:$proxyport/");
}

# ---------------------------------------------------------------------------

# if ($just_caching) {
#   # just put the pages into the cache and forget about it
#   foreach $url (@main::cmdline_urls) {
#     &log ("bg: getting $url ...");
#     &get_page ($url, 0);
#   }
#   &log ("bg: done.");
#   &cleanexit;
# }

if ($#main::cmdline_urls > -1) {
  @sites = ();
  foreach $url (@main::cmdline_urls) {
    # if it's a local file URL, switch around the slashes (for windows)
    if (&Portability::MyOS eq 'Win32' && $url =~ m,file:///,i) {
      $url =~ s/\\/\//g;
    }
    # REVISIT -- I don't know what to do in the same case for MacOS ;)

    if (-r $url) {
      if ($url =~ m,^/,) {
	$url = 'file://'.$url;
      } else {
	eval '
	  use Cwd;
	  $url = "file://".getcwd."/".$url;
	1;' or die ("eval failed: $@");
      }
    }

    if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

    if (!defined $name{$url}) {
      $name{$url} = $url;
      if ($url =~ m,/([^/]+)$,) { $name{$url} = $1; }
    }
    $confline = "$url:0";

    push (@sites, $url);
    &SetDefaultConfigForURL ($url);
    $site_defined_at{$url} = $confline;
    $story_lifetime{$url} = 0;	# any age of story for command-line URLs

    if (defined $argv_levels) {
      $levels{$url} = $argv_levels-2;
    }
    if (defined $argv_storyurl) {
      $story_limit_to{$url} = $argv_storyurl;
    }
  }
}

# ---------------------------------------------------------------------------

($mday, $mon, $year, $monstr) = &get_date;
if (!$nodates) {
  $date_for_filename = sprintf ("%04d_%02d_%02d_", $year, $mon, $mday);
  $title = "$year-".$monstr."-$mday: ";
} else {
  $date_for_filename = '';
  $title = '';
}

%already_seen = ();
%last_modtime = ();
%main::oldest_already_seen = ();
@seen_this_time = ();
$main::failed_to_cvt = 0;
$main::use_convert_tool = (!&writing_text);
$main::warn_about_external_links = 0;	# turned on where necessary

&make_dirs;
&generate_output_filenames (@sites);

if (defined $main::admincmd) {
  if ($main::admincmd eq 'dump-sites') {
    while (($key,$outfile) = each %key2outfile) {
      my $url = $key2url{$key};
      my $title = $key2title{$key};
      $title =~ s,\t, ,g; $title =~ s,^\d+-\S+-\d+: ,,g;
      my $base = $key2tmp{$key}; $base =~ s,^.*${slash}(\S+?)\.tmp$,$1,o;
      my $site = $site_defined_at{$url};
      $site =~ s/:\d+$//; $site =~ s/^.*${slash}(\S+?)$/$1/o;

      # foobar.site	http://www.foobar.com/	Foo Bar	1999_01_01_Foo_Bar
      print "$site\t$url\t$title\t$base\n";
    }
    exit;

  } elsif ($main::admincmd eq 'journal') {
    open (JOURNAL, "> $tmpdir${slash}journal")
    	or die "cannot write to $tmpdir${slash}journal!\n";

  } else { &usage; }
}

if ($main::use_convert_tool) {
  if (!-d $pilotinstdir) {
    mkdir ($pilotinstdir, 0755) || die "failed to mkdir '$pilotinstdir'\n";
  }
}

&expire_old_cache_files;
&ScoopHTTP::UserAgent::load_logins;
&read_state;

# to do all the conversions at the end:
#&get_all_sites;
#foreach $filekey (@main::filekeys) { &convert_output($filekey); }

# to do them as each site is scooped:
&get_all_sites (1);

&write_state;
&ScoopHTTP::UserAgent::save_logins;

&verbose ("Finished!");
&cleanexit;

# ---------------------------------------------------------------------------

sub ReadSitesDir {
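  # Read every *.site file in the sites directory (or only those named
  # with -site) into @conf/@conflines, to be parsed along with the main
  # configuration.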
  my ($file, $key);
  my %sites_wanted = ();
  %read_sites = ();

  @site_files_to_read = ();
  if ($#sites_wanted >= 0) {
    &verbose ("Restricting to sites: ".join (' ', @sites_wanted));
    foreach $key (@sites_wanted) {
      if (-r $key) {
	# it's a site file, not a name, add it to the list
	push (@site_files_to_read, $key);
      }

      $sites_wanted{$key} = 1;
    }
  }
  if ($#main::cmdline_urls >= 0) {
    # we're only snarfing the command-line URLs, skip the predefined sites
    return;
  }

  if (defined $sitesdir) {
    foreach $file (<${colon}$sitesdir${slash}*.site>) {
      next if ($file =~ /(\.swp$|core|\.bak$|\~$|^#)/);	# skip backups
      next if (-d $file);		# skip directories

      if ($#sites_wanted >= 0) {
	my $base = $file; $base =~ s,^.*[\/\\:]([^\/\\:]+)$,$1,g;
	&main::dbg ("checking if site file is wanted: $file");
	next unless (defined $sites_wanted{$base} || defined $sites_wanted{$file});
      }
      push (@site_files_to_read, $file);
    }
  }

  foreach $file (@site_files_to_read) {
    next if (defined $read_sites{$file});
    $read_sites{$file} = 1;

    if (open (IN, "< $file")) {
      my $line = 0;
      while (<IN>) {
	push (@conf, $_); push (@conflines, "$file:$line");
	$line++;
      }
      close IN;
      &verbose ("Scooping site from file \"$file\".");

    } else {
      &sitewarn_file_line ("$file:0", "Cannot read $file\n");
    }
  }
}

# ---------------------------------------------------------------------------
# Default configuration for a newly-specified URL.

sub SetDefaultConfigForURL {
  my $url = shift;

  $sections{$url} = "";		# none yet
  $active{$url} = 1;		# active by default
  $use_table_smarts{$url} = 1;	# use smarts
  $levels{$url} = -1;		# 1-level site
  $extra_urls{$url} = '';	# no extra URLs
  $story_lifetime{$url} = 90;	# don't scoop stories older than 3 months

  # default limit to articles at the same site
  $url =~ m,^((http|file)://[^/]*/),i;
  if (defined $1) {
    $story_limit_to{$url} = $1.'.*';
  } else {
    &sitewarn_file_line ($confline,
    	"Unsupported URL protocol for URL '".$url."'.\n");
  }
}

# ---------------------------------------------------------------------------
# Incorporate defaults from the main URL into each Section.
#
sub FinishConfigSection {
  my $sect = shift;
  my $url = shift;

  if ($sect ne '') {
    if (!defined $name{$sect}) { $name{$sect} = $url; }
    if (!defined $desc{$sect}) { $desc{$sect} = $url; }
    if (!defined $story_start{$sect}) { $story_start{$sect} = $story_start{$url}; }
    if (!defined $story_end{$sect}) { $story_end{$sect} = $story_end{$url}; }
    if (!defined $head_pat{$sect}) { $head_pat{$sect} = $head_pat{$url}; }
    if (!defined $printable_sub{$sect})
		{ $printable_sub{$sect} = $printable_sub{$url}; }
    if (!defined $story_limit_to{$sect})
		{ $story_limit_to{$sect} = $story_limit_to{$url}; }
    if (!defined $story_skip{$sect}) { $story_skip{$sect} = $story_skip{$url}; }
    if (!defined $story_diff{$sect}) { $story_diff{$sect} = $story_diff{$url}; }
    if (!defined $story_follow_links{$sect})
    		{ $story_follow_links{$sect} = $story_follow_links{$url}; }
    if (!defined $story_lifetime{$sect})
    		{ $story_lifetime{$sect} = $story_lifetime{$url}; }
    if (!defined $active{$sect}) { $active{$sect} = $active{$url}; }

    # If the main site is disabled, so are the sub-sites.
    if ($active{$url} == 0) {
      $active{$sect} = 0;
    }

    $levels{$sect} = $levels{$url};
    for ($lev = $levels{$url}; $lev >= 0; $lev--)
    {
      if (!defined $links_start{"$lev $sect"}) {
	$links_start{"$lev $sect"} = $links_start{"$lev $url"};
      }
      if (!defined $links_end{"$lev $sect"}) {
	$links_end{"$lev $sect"} = $links_end{"$lev $url"};
      }
      if (!defined $links_skip{"$lev $sect"}) {
	$links_skip{"$lev $sect"} = $links_skip{"$lev $url"};
      }
      if (!defined $links_diff{"$lev $sect"}) {
	$links_diff{"$lev $sect"} = $links_diff{"$lev $url"};
      }
      if (!defined $links_print{"$lev $sect"}) {
	$links_print{"$lev $sect"} = $links_print{"$lev $url"};
      }
      if (!defined $links_follow_links{"$lev $sect"}) {
	$links_follow_links{"$lev $sect"} = $links_follow_links{"$lev $url"};
      }
      if (!defined $links_limit_to{"$lev $sect"}) {
	$links_limit_to{"$lev $sect"} = $links_limit_to{"$lev $url"};
      }
    }
  }
}

# ---------------------------------------------------------------------------

sub make_dirs {
  if (!-d $tmpdir) {
    mkdir ($tmpdir, 0777) || die "failed to mkdir '$tmpdir'\n";
  }
  chdir ($tmpdir) or die "cannot cd to $tmpdir\n";

  $user_tmpdir = "$tmpdir${slash}sitescooper_$userid";

# passwords for sitescooper caches are not fully implemented right now!
#
#if ($main::cgimode) {
#open (PWD, "< $user_tmpdir${slash}passwd");
#my $pwd = <PWD>; close PWD;
#my $salt = substr($pwd, 0, 2);
#if (crypt ($main::password, $salt) ne $pwd) {
#&ScoopCGI::passwd_failed; exit;
#}
#}

  if (!-d $user_tmpdir) {
    mkdir ($user_tmpdir, 0777) || die "failed to mkdir '$user_tmpdir'\n";
  }

  if (!defined $pilotinstdir) {
    $pilotinstdir = "$user_tmpdir${slash}prc";
    if (!-d $pilotinstdir) {
      mkdir ($pilotinstdir, 0777) || die "failed to mkdir '$pilotinstdir'\n";
    }

    if (!$cgimode) {
      &verbose ("Warning: since no PilotInstallDir was specified".
      	" in the configuration,\nI\'ll use $pilotinstdir .\n");
    }
  }

  if ($main::debug) {
    open (LOGFILE, "> $user_tmpdir${slash}log.txt");
    select LOGFILE; $| = 1; select STDOUT;
  }

  if ($outdir eq '') { $outdir = "$user_tmpdir${slash}txt"; }
  if (!-d $outdir) {
    mkdir ($outdir, 0777) || die "failed to mkdir '$outdir'\n";
  }

  $cachedir = "$user_tmpdir${slash}cache"; $newcachedir = $cachedir;
  if (!-d $cachedir) {
    mkdir ($cachedir, 0777) || die "failed to mkdir '$cachedir'\n";
  }

  if (defined $sharedcache) {
    if (!-d $sharedcache) {
      mkdir ($sharedcache, 0777) || die "failed to mkdir '$sharedcache'\n";
    }
  }

  $alreadyseen = "$user_tmpdir${slash}already_seen.txt"; $newalreadyseen = $alreadyseen;

  if ($nowrite) {
    $newcachedir = "$user_tmpdir${slash}new_cache";
    if (!-d $newcachedir) {
      mkdir ($newcachedir, 0777) || die "failed to mkdir '$newcachedir'\n";
    }
    $newalreadyseen = "$user_tmpdir${slash}new_already_seen.txt";
  }

  # check for spaces on Win32 -- MakeDocW can't handle them!
  # Thx to wgoosey /at/ servtech.com for spotting this one.
  if ($main::outputfilter eq 'makedoc') {
    if (&Portability::MyOS eq 'Win32') {
      if ($outdir =~ / /) {
	warn "

Warning: Sitescooper's text output directory contains spaces in its path.
The MakeDocW conversion tool does not support this, so you may need to
move Sitescooper to another directory (e.g. C:\\Sitescooper), or set
TextSaveDir in the configuration, for this conversion to work!
(This is a bug in MakeDocW.exe.)

";
      }
    }
  }
}

# ---------------------------------------------------------------------------

sub expire_old_cache_files {
  if (defined $pilotinstdir) {
    sub expire_prcdir { unlink if (-f $_ && -M $_ > $main::expiry_days); }
    find(\&expire_prcdir, $pilotinstdir);
  }

  if (defined $outdir) {
    if (!$main::use_convert_tool) {
      sub expire_outdir { unlink if (-f $_ && -M $_ > $main::expiry_days); }
      find(\&expire_outdir, $outdir);
    }
  }

  sub expire_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); }
  find(\&expire_cache, $cachedir);

  if (defined $sharedcache) {
    sub expire_shared_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); }
    find(\&expire_shared_cache, $sharedcache);
  }
}

# ---------------------------------------------------------------------------

sub read_state {
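  # Load the already_seen file: each line is a URL followed by
  # " lastmod=" and a Unix timestamp (its last-modified time, or the
  # time it was scooped).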
  if ($main::refresh == 0) {
    if (!open (IN, "< $alreadyseen")) {
      &verbose ("Cannot read $alreadyseen, creating a new one");

    } else {
      my $url;
      my $mod;
      my $urlhost;
      while (<IN>) {
	/ lastmod=/; $url = $`; $mod = $';
	$already_seen{$url} = 1;
	$last_modtime{$url} = $mod+0;

	m,http://(\S+?)/,; $urlhost = $1;
	if (defined($main::oldest_already_seen_this_site{$urlhost})
	    ? $main::oldest_already_seen_this_site{$urlhost} > $mod : 1)
	{
	  $main::oldest_already_seen_this_site{$urlhost} = $mod;
	}
      }
      close IN;
    }
  }
}

# ---------------------------------------------------------------------------

sub generate_output_filenames {
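  # Map each active site/section to its temporary file, output file,
  # PRC file and title, keyed by "$site$url" (the "filekey").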
  my @sites = @_;
  my %already_done = ();

  foreach $site (@sites) {
    my @urls = ($site);
    if ($sections{$site} ne "") {
      @urls = split (/\|\|\|/, $sections{$site});
    }

    foreach $url (@urls) {
      next if ($url eq '');
      next unless ($active{$url} == 1);

      my $filekey = $site.$url;

      $sitename = $name{$site};
      $sectname = '';
      if ($site ne $url) { $sectname = "_".$name{$url}; }

      my $filedesc = $date_for_filename.$sitename.$sectname;
      $filedesc =~ s/[^-_A-Za-z0-9]+/_/g;
      $filedesc =~ s/_+$//; $filedesc =~ s/^_+//;

      if (&Portability::MyOS eq 'Mac') {
	# try to limit the filename to 32 characters
	$filedesc =~ s/^(.{32}).*$/$1/g;
      }

      $outidxfile = undef;
      if (&writing_html && $main::fileperpage) {
	$outfile = $outdir.$slash.$filedesc.'.pages';
	$outidxfile = $filedesc.'.html';
      } elsif (&writing_html) {
	$outfile = $outdir.$slash.$filedesc.'.html';
      } else {
	$outfile = $outdir.$slash.$filedesc.'.txt';
      }

      next if (defined $already_done{$outfile});
      $already_done{$outfile} = 1;

      $tmpfile = $outdir.$slash.$filedesc.'.tmp';

      $sectname =~ s/_+$//; $sectname =~ s/^_+//;
      my $secttitle = "${title}$sitename" .
			  ($sectname ne '' ? ": $sectname" : "");

      $main::key2tmp{$filekey} = $tmpfile;
      $main::key2outfile{$filekey} = $outfile;
      $main::key2outidxfile{$filekey} = $outidxfile;
      $main::key2title{$filekey} = $secttitle;
      $main::key2sitename{$filekey} = $sitename;
      $main::key2site{$filekey} = $site;
      $main::key2url{$filekey} = $url;
      push (@main::filekeys, $filekey);

      if ($main::dumpprc) {
	$main::key2syncfile{$filekey} = $tmpfile;	# reuse it!
      } else {
	$main::key2syncfile{$filekey} =
		    $pilotinstdir.$slash.$filedesc.'.prc';
      }
    }
  }
}

# ---------------------------------------------------------------------------

sub get_all_sites {
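  # Scoop each filekey in turn: download its pages, assemble the output
  # document in $main::output_file, write it out, and (if $convert_now
  # is set) convert it to the target format immediately.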
  my $convert_now = shift;
  if (!defined $convert_now) { $convert_now = 0; }

  foreach $filekey (@main::filekeys) {
    $outfile = $key2outfile{$filekey};
    $tmpfile = $key2tmp{$filekey};
    my $outidxfile = $key2outidxfile{$filekey};
    my $secttitle = $key2title{$filekey};
    my $sitename = $key2sitename{$filekey};
    my $site = $key2site{$filekey};
    my $url = $key2url{$filekey};

    # This apparently is needed on MacOS. Sounds unlikely, but there it
    # is...
    if (&Portability::MyOS eq 'Mac') {
      eval '
	use File::Basename;
	my $parentdir = dirname($outfile);
	if (!-d $parentdir) { mkdir ($parentdir, 0755); }
      1;' or die ("eval failed: $@");
    }

    &main::dbg ("tmp file: $tmpfile, out file: $outfile");
    (-d $tmpfile) && &rmtree ($tmpfile);
    if ($main::outstyle == $OUT_HTML && $main::fileperpage) {
      mkdir ($tmpfile, 0755) || die "cannot mkdir $tmpfile\n";
      &clear_page_tmpfiles;
      $main::output_filename = $tmpfile.$slash.$outidxfile;
    } else {
      $main::output_filename = $tmpfile;
    }

    $_ = $site_defined_at{$url}; /^(.*):/;
    &verbose ("Now scooping site \"$1\".");

    $main::output_file = '';
    %main::output_links_snarfed = ();
    %main::oldest_already_seen_this_site = ();
    $main::current_story_index = 0;
    @main::output_story_urls = ();

    if ($main::outstyle == $OUT_HTML) {
      $main::output_file .= "<html><head><title>$secttitle</title></head>".
	      "<body><h1>$secttitle</h1><br>\n";
    } else {
      $main::output_file .= "$secttitle\n\n\n";
    }

    $stories_found = 0;
    $file_size = 0;
    $hit_file_size_limit = 0;

    &set_got_intr_behaviour ('setflag');
    my $u;
    foreach $u ($url, split (' ', $extra_urls{$url})) {
      # if we were interrupted, clear the flag and go on
      if ($got_intr_flag) { &set_got_intr_behaviour ('setflag'); }

      if ($main::outstyle == $OUT_HTML && $main::fileperpage) {
	$page_to_tmpfile{$u} = $tmpfile.$slash.$outidxfile;
      } else {
	$page_to_tmpfile{$u} = $tmpfile;
      }

      if ($levels{$url} >= 0) {
	&download_front_page ($u, $url, $levels{$url});
      } else {
	# just read the text and write it to a file
	&download_story_page ($u, $url, 1);
      }
    }
    &set_got_intr_behaviour ('exit');

    if ($stories_found > 0) {
      &verbose ("$secttitle: $stories_found stories downloaded (".
	      sprintf ("%3.1f", $file_size/1024)." K uncompressed).");

      # convert sitescooper navigation links: [<-][->]
      {
	my $ind = $main::current_story_index;
	# trim off the first and last ones anyway
	$main::output_file =~
	    s/\[<a href=\"__SITESCOOPER_STORY_(-1|${ind})\">.*?<\/a>\]//g;
	# and run through the rest
	for ($i = 0; $i < $ind; $i++) {
	  $main::output_file =~
	  s/\"__SITESCOOPER_STORY_${i}\"/\"${main::output_story_urls[$i]}\"/g;
	}
      }

      $main::output_file = &remove_external_links ($main::output_file);

      my $blurb1 = "(End of snarf - copyright retained by original ".
	"providers.";
      my $blurb2 = "Downloaded and converted by sitescooper; see ".
	"$main::home_url )";

      if ($main::outstyle == $OUT_HTML) {
	$main::output_file .= "<p><hr><i>$blurb1 $blurb2</i></body></html>\n";
      } elsif ($main::outstyle == $OUT_DOC) {
	$main::output_file .= "$blurb1 $blurb2\n<$main::bookmark_char>\n";
      } else {
	$main::output_file .= "$blurb1\n$blurb2\n";
      }

      open (OUTFILE, "> $main::output_filename")
		      or die "Failed to create $main::output_filename\n";
      print OUTFILE $main::output_file;
      close OUTFILE or warn "Failed to write to $tmpfile";

      if (!$main::fileperpage) {
	if ($main::dump) {
	  open (IN, "< $tmpfile");
	  while (<IN>) { print STDOUT; }
	  close IN; unlink ($tmpfile);
	  # ensure we don't try to convert it later
	  undef $main::key2syncfile{$filekey};

	} else {
	  unlink ($outfile); rename ($tmpfile, $outfile);
	  if ($convert_now) { &convert_output($filekey); }
	}

      } else {
	&rmtree ($outfile); rename ($tmpfile, $outfile);

	if ($main::dump) {
	  # print the path to the created directory containing the pages
	  print $outfile."\n";
	  # ensure we don't try to convert it later
	  undef $main::key2syncfile{$filekey};

	} else {
	  if ($convert_now) { &convert_output($filekey); }
	}
      }

      my ($from, $to);
      while (($from,$to) = each %main::oldest_already_seen_this_site) {
	&main::dbg ("Saving new already_seen age cache entry: $from => $to ".
		"(".&time2datestr($to).")");
	$main::oldest_already_seen{$from} = $to;
      }

    } else {
      close OUTFILE;
      &verbose ("$secttitle: no new stories, ignoring.");
      &main::dbg ("(Not setting already_seen age cache since no links were followed)");

      undef $main::key2syncfile{$filekey};
      if ($main::fileperpage) {
	&rmtree ($tmpfile);
      } else {
	unlink $tmpfile;
      }
    }

    $main::output_file = '';
  }
}

# ---------------------------------------------------------------------------

sub convert_output {
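  # Run the selected conversion tool (makedoc, iSilo, HTML2Doc/RichReader
  # or a user-supplied -pipe command) on one scooped site's output file,
  # producing the installable file named in %key2syncfile.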
  my $filekey = shift;

  return unless ($main::use_convert_tool);

  my $syncfile = $key2syncfile{$filekey};
  return unless defined $syncfile;

  $outfile = $key2outfile{$filekey};
  my $outidxfile = $key2outidxfile{$filekey};
  my $secttitle = $key2title{$filekey};
  unlink $syncfile;

  if ($main::outputfilter eq '__cat__') {
    open (IN, "< ".$outfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $outfile;
    return;
  }

  if ($main::outputfilter eq 'makedoc') {
    $cmd = "$makedoc $outfile ".$syncfile." '".$secttitle."'";

  } elsif ($main::outputfilter eq 'isilo') {
    if ($main::fileperpage) {
      $cmd = "$isilo $isiloargs $isilomultipageargs ".
      				$outfile.$slash.$outidxfile;
    } else {
      $cmd = "$isilo $isiloargs ".$outfile;
    }

    # UNIX iSilo utils take the output filename as well; Win32
    # doesn't need it as it installs as it goes along.
    if (&Portability::MyOS ne 'Win32') {
      $cmd .= " ".$syncfile;
    }

    # Win32 iSilo only takes the -u arg for the GUI version, not the
    # command line one. Strip the arg for the command-line converter.
    if (&Portability::MyOS eq 'Win32') {
      if ($cmd =~ /isiloc32/i) { $cmd =~ s/ -u / /g; }
    }

  } elsif ($main::outputfilter eq 'richreader') {
    $cmd = "$richreader $richargs ".$outfile;

    # UNIX HTML2Doc utils take the output filename as well; Win32
    # doesn't need it as it installs as it goes along.
    if (&Portability::MyOS ne 'Win32') {
      $cmd .= " ".$syncfile;
    }

  } elsif ($main::outputfilter =~ /^cmd: /) {
    $cmd = $';
    $cmd =~ s/__SCOOPFILE__/${outfile}/g;
    $cmd =~ s/__SYNCFILE__/${syncfile}/g;
    $cmd =~ s/__TITLE__/${secttitle}/g;

  } else {
    die "bad output filter $main::outputfilter\n";
  }

  if (&Portability::MyOS ne 'Mac') {
    &add_cmd_dir_to_path ($cmd);
    &verbose ("Running: $cmd");
    system $cmd;
    # output a newline, MakeDoc won't do it itself.
    if (&Portability::MyOS eq 'UNIX') { &verbose ("\n"); }

    if (($? >> 8) != 0) {

      # work around a bug in iSilo converter on Win32 -- it
      # reports failure even when the conversion went fine.
      if (&Portability::MyOS ne 'Win32' ||
	    $main::outputfilter ne 'isilo')
      {
	warn "command failed: $cmd\n";
	$main::failed_to_cvt = 1;
      }
    }

  } else {
    # system is broken on MacOS, so print the required command
    # so it can be run easily from the MPW shell
    if (!defined $macos_system_warning_written) {
      warn "[Warning: not using the broken MacPerl system() call. ".
	    "You will need to\ncut and paste the command ".
	    "lines yourself!]\n\n";
      $macos_system_warning_written = 1;
    }
    print $cmd, "\n";
  }

  if (!$main::debug && !$main::keep_tmps) {
    &rmtree ($outfile);		# don't keep .txt files around
  }

  # If we're dumping, read in the generated file and write it to
  # STDOUT.
  if ($main::dumpprc) {
    open (IN, "< ".$syncfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $syncfile;
  }

  # output the name of the finished file. This is handy for scripts
  # which want to collect these files and store them somewhere.
  # REVISIT -- verify that Win32 iSilo uses the same filename.
  if (!$main::dumpprc) {
    print "Created: ".$syncfile."\n";
  }
}

sub add_cmd_dir_to_path {
  local ($_);
  my $cmd = shift;

  # Perl on some Win32 platforms seems to require that the binary be
  # in the PATH.
  #
  if (&Portability::MyOS eq 'Win32') {
    $_ = $cmd;
    if (!/[\\\/:]/) { return; }	# foo arg ...
    if (/^\"([^\"]+)\"/) { $cmd = $1; }	# "C:\Program Files\foo.exe" arg ...
    elsif (/^(\S+)\s/) { $cmd = $1; }	# C:\windows\foo.exe arg ...
    else { $cmd = $_; }			# C:\windows\foo.exe

    $cmd =~ s,[\\/][^\\/]+\s*$,,g;		# trim the filename
    $cmdpat = $cmd; $cmdpat =~ s,(\W),\\$1,g;	# escape funny chars

    if ($ENV{'PATH'} !~ /;${cmdpat}(;|$)/) {
      &main::dbg ("Adding directory to command path: $cmd");
      my $path = $ENV{'PATH'} || $ENV{'Path'} || $ENV{'path'};
      $path .= ";$cmd"; $ENV{'PATH'} = $path;
    }
  }
}

# ---------------------------------------------------------------------------

sub write_state {
  if (!$main::failed_to_cvt) {
    # only write alreadyseen if the files converted successfully, otherwise
    # the user may lose some recent news due to a makedoc screwup.
    #
    my $towrite = '';
    my $now = time;
    my $twomonthsago = $now - (24*60*60*30*2);
    my $mod;
    my $urlhost;

    # keep the already-seen list small by cutting out old entries.  We
    # define "old entries" as (a) older than 2 months and (b) older than
    # the oldest link we saw in today's scooping run.
    #
    if (!$main::refresh) {
      &main::dbg ("trying to cut old entries from already-seen URL cache");

      foreach $_ (keys %already_seen) {
	m,http://(\S+?)/,; $urlhost = $1;
	if (defined $last_modtime{$_} && defined $main::oldest_already_seen{$urlhost})
	{
	  $mod = $last_modtime{$_};
	  if ($twomonthsago > $mod && $main::oldest_already_seen{$urlhost} > $mod) {
	    &main::dbg ("stripping old entry: $_ lastmod=$mod (".&time2datestr($mod).")");
	    next;
	  }
	}
	$towrite .= $_." lastmod=".(defined $last_modtime{$_}
		    ? $last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, "> $newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot rewrite $newalreadyseen\n";
      } else {
	warn "Cannot rewrite $newalreadyseen\n";
      }

    } else {
      # -refresh is on: no pruning was done, and the list of URLs seen this
      # run is small enough that we can simply append it to the file.
      &main::dbg ("appending already-seen URLs to $newalreadyseen");

      foreach $_ (@seen_this_time) {
	$towrite .= $_." lastmod=".(defined $last_modtime{$_}
		    ? $last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, ">> $newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot append to $newalreadyseen\n";
      } else {
	warn "Cannot append to $newalreadyseen\n";
      }
    }


    my ($from, $to);
    while (($from,$to) = each %caches_to_rename) {
      &main::dbg ("Saving new cache file: $to");
      rename ($from, $to) or warn ("rename $from -> $to failed\n");
    }
  }
}

# ---------------------------------------------------------------------------
# Note on levels: a 2-level site has a contents page and stories off that;
# 3-level has issue links page, per-issue contents page and stories.
# 1-level has only the story page, no links.
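# The per-level site settings below are keyed as "$level $baseurl" (e.g.
# $links_limit_to{"0 $baseurl"}); follow_front_link() decides whether a
# link leads to another contents page (one level down) or straight to a
# story, and a 1-level site ($levels{$baseurl} < 0) has its single story
# page written as if it were the front page.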

sub download_front_page {
  my $url = shift;
  my $baseurl = shift;
  my $level = shift;
  my ($cachefile, $page);
  my $key = "$level $baseurl";

  $sitewarn_current_site_line = $site_defined_at{$baseurl};

  # Use this hash to avoid endless loops when scooping multi-page front pages.
  return if (defined $already_seen_this_session{$url});
  $already_seen_this_session{$url} = 1;

  if (defined $links_limit_to{$key}) {
    if (!match_url ($url, $links_limit_to{$key})) {
      &dbg ("front page URL $url does not match ".${links_limit_to{$key}}.", ignoring.");
      return;
    }
  }

  if (defined $links_skip{$key}) {
    if ($url =~ m#^${links_skip{$key}}$#) {
      &verbose ("Skipping: $url"); return;
    }
  }

  my $origurl = $url;
  $url = &apply_url_postproc($url, $baseurl);
  if (!defined $url) {
    &main::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  &verbose ("Reading level-".($level+2)." front page: $url");
  &set_got_intr_behaviour ('setflag');

  my $is_dynamic_html;
  if (defined $cacheable{$key}) {
    $is_dynamic_html = ($cacheable{$key} == 0);
  } elsif (defined $links_diff{$key} && $links_diff{$key} != 0) {
    $is_dynamic_html = 1;	# pages that need diff'ing are dynamic
  } elsif ($level < $levels{$baseurl}) {
    # second-level or deeper front pages are usually not dynamic, more
    # likely to be a static table of contents.
    $is_dynamic_html = 0;
  } else {
    $is_dynamic_html = 1;	# front pages are usually dynamic
  }

  push (@seen_this_time, $url);
  $already_seen {$url} = 1;
  &check_for_oldest ($url);	# we came across the link, so keep it around

  $page = &get_page ($url, $is_dynamic_html);
  return unless defined $page;
  if ($got_intr_flag) { goto interrupted; }

  if (defined $last_modtime{$url} && defined $story_lifetime{$baseurl}
  	&& $last_modtime{$url}
  	< time - $story_lifetime{$baseurl} * 24 * 60 * 60)
  {
    &verbose ("Skipping (contents are older than ".
    		$story_lifetime{$baseurl}." days): $url");
    return;
  }

  my $origpage = $page;
  &journal ("pre_strip_level".($level+2), $page);
  $page = &strip_front_page ($url, $key, $baseurl, $page);
  &journal ("post_strip_level".($level+2), $page);

  my $cachedpage;
  if (defined $links_diff{$key} && $links_diff{$key} != 0) {
    $cachedpage = &strip_front_page ($url, $key, $baseurl,
				&get_cached_page_for_diff ($url));
    $page = &get_new_bits ($cachedpage, $page);
    &cache_page_later ($url, $origpage);

  } else {
    &cache_page_later ($url, $origpage);
  }

  if (defined fileno JOURNAL) {
    # always write a text-mode version for the journal
    &journal ("to_text_level".($level+2),
    	&html_to_text ($url, $baseurl, $page, $OUT_TEXT));
  }

  if ((defined $links_print{$key} && $links_print{$key} != 0) ||
    	&writing_html)
  {
    $main::warn_about_external_links = 1;
    my $txtpage = &html_to_text ($url, $baseurl, $page, $main::outstyle);
    $main::warn_about_external_links = 0;

    my $outme = 1;
    if ($is_dynamic_html && defined $cachedpage && !$main::refresh) {
      # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
      # HTML. Sometimes the ad banners will be the only things that have
      # changed between retrieves, and html_to_text will have stripped those
      # out.
      my $cachedtxt = &html_to_text ($url, $baseurl, $cachedpage,
      			$main::outstyle);
      if (&text_equals ($txtpage, $cachedtxt)) {
	&verbose ("Not printing contents (text has not changed): $url");
	$outme = 0;
      }
    }

    if ($outme) {
      &verbose ("Printing: $url");
      &write_as_story (1, $url, $baseurl, $txtpage, undef);
    }

    if (&writing_html && (!defined $links_print{$key} || $links_print{$key} == 0))
    {
      # don't count the front page as a story if we're just outputting it
      # because we're writing HTML.
      $stories_found--;
    }
  }

  # see if there's any links to extra contents pages
  my @turnoverlinks = &get_contents_turnover_links ($url, $key, $page);

  my @links = ();
  my $wrote_sep = 0;

  # This was all getting a bit tricky, so I've redone it a bit.
  # It now does not try to strip closing tags, as it doesn't have to.
  while (1) {
    if ($got_intr_flag) { goto interrupted; }

    if (
      $page =~ s/<a\s+[^>]*href=\"([^\"%>]+)\"//is
      ||
      $page =~ s/<a\s+[^>]*href=([^\s>]+)//is
      )
    {
      push (@links, $1);
      next;
    }

    # support for frames
    if (
      $page =~ s/<frame\s+[^>]*src=\"([^\">]+)\"//is
       ||
      $page =~ s/<frame\s+[^>]*src=([^\s>]+)//is
      )
    {
      my $link = $1;
      if (&writing_html) {
	if ($wrote_sep == 0) {
	  $main::output_file .= "<p><hr>\n"; $wrote_sep = 1;
	}
	$main::output_file .=
		&translate_link ($url, $baseurl, $link, $link). "<br>\n";
      }
      push (@links, $link);
      next;
    }

    # rudimentary support for My-Netscape-style RDF files
    if ($page =~ s/<item>(.*?)<link\s*[^>]*>(.+?)<\/link>(.*?)<\/item>//is)
    {
      my ($title, $link, $title2) = ($1, $2, $3);

      # <link> tags in RSS can contain other crap. Ditch it; we want the link!
      $link =~ s/^.*<url>(.*?)<\/url>.*$/$1/gis;

      $link = &AbsoluteURL ($url, $link);
      if ($title =~ /<title>(.*?)<\/title>/is
	   || $title =~ /<text>(.*?)<\/text>/is
	   || $title2 =~ /<title>(.*?)<\/title>/is
	   || $title2 =~ /<text>(.*?)<\/text>/is)
      {
	$url_title{$link} = $1;
      }

      push (@links, $link);
      next;
    }

    last;		# no more links available
  }

  if ($#links >= 0) {
    &verbose ("Found ".($#links+1)." links, examining them.");
  }

  # now traverse the links and get the stories
  &journal ("links_level".($level+2), join ("\n", @links));
  foreach $_ (@links) {
    if ($hit_file_size_limit) {
      my $msg = "File size limit of $main::filesizelimit K exceeded,".
			  " skipped some stories from this site.";
      &verbose ($msg);
      if (&writing_html) {
	$main::output_file .= "<hr><i>$msg</i><br>\n";
      } else {
	$main::output_file .= "\n($msg)\n";
      }
      last;
    }

    &follow_front_link ($baseurl, $url, $level, $_);
    if ($got_intr_flag) { goto interrupted; }
  }

  # if there's more contents pages, process them as well.
  &journal ("turnover_links_level".($level+2), join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($got_intr_flag) { goto interrupted; }
      $link = &AbsoluteURL ($url, $link);
      &download_front_page ($link, $baseurl, $level);
    }
  }

interrupted:
  &set_got_intr_behaviour ('exit');
}

# ---------------------------------------------------------------------------

sub follow_front_link {
  my ($baseurl, $url, $level, $nextpage) = @_;
  $nextpage = &AbsoluteURL ($url, $nextpage);

  &main::dbg ("Link found on $baseurl: $nextpage");

  # should we download the next front page?
  if (defined $links_start{($level-1)." $baseurl"}) {
    &download_front_page ($nextpage, $baseurl, $level-1);
    return;
  }
  if ($got_intr_flag) { return; }

  # nope, we're onto the stories already
  $nextpage = &make_printable ($baseurl, $nextpage, 1);

  &download_story_page ($nextpage, $baseurl, 0);
}

sub make_printable {
  my $baseurl = shift;
  my $nextpage = shift;
  my $warn_if_fail = shift;

  if (defined $printable_sub{$baseurl}) {
    my $new = $nextpage;
    my $sub = $printable_sub{$baseurl};
    $sub =~ s/\\(\d+)/\$$1/g;	# avoid warnings

    eval '$new =~ '.$sub.'; 1;'
      or &sitewarn ("Printable substitution failed! ($@)\n");

    if ($nextpage ne $new) {
      # &verbose ("Using printable version instead: $new");
      if (defined $story_limit_to{$baseurl} &&
		!match_url ($new, $story_limit_to{$baseurl}))
      {
	if ($warn_if_fail) {
	  &sitewarn ("Printable version does not match StoryURL".
		"pattern, reverting to $nextpage\n");
	}
      } else {
	$nextpage = $new;
      }
    }
  }
  $nextpage;
}

# ---------------------------------------------------------------------------

sub download_story_page {
  my $url = shift;
  my $baseurl = shift;
  my $is_dynamic_html = shift;
  my ($cachefile, $page);

  $url =~ s/#.*$//g;		# strip references
  study $url;

  $sitewarn_current_site_line = $site_defined_at{$baseurl};

  my $cacheflag = $cacheable{"s $baseurl"};
  if (defined $cacheflag) {
    # user setting overrides our heuristics
    $is_dynamic_html = ($cacheflag==0);
  }
  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $is_dynamic_html = 1;	# diff pages are always dynamic
  }

  if (defined $story_limit_to{$baseurl}) {
    if (!defined $output_storyurl_dbg{$baseurl}) {
      &main::dbg ("StoryURL for $baseurl: ${story_limit_to{$baseurl}}");
      $output_storyurl_dbg{$baseurl} = 1;
    }

    if (!match_url ($url, $story_limit_to{$baseurl})) {
      &main::dbg ("Non-story URL ignored: $url");
      return;
    }
  }
  if ($url =~ m,^(ftp|mailto|https|gopher|pnm)://,) {
    &main::dbg ("Non-story URL ignored: $url");
    return;
  }

  my $origurl = $url;
  $url = &apply_url_postproc($url, $baseurl);
  if (!defined $url) {
    &main::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  if (defined $story_skip{$baseurl}) {
    if ($url =~ m#^${story_skip{$baseurl}}$#) {
      &verbose ("Skipping: $url"); return;
    }
  }

  &check_for_oldest ($url);

  if (!$is_dynamic_html && $already_seen {$url}) {
    &main::dbg ("skipping, already seen: $url");
    return;
  }

  push (@seen_this_time, $url);
  $already_seen {$url} = 1;

  &get_story_page ($url, $baseurl, $is_dynamic_html);
}

# ---------------------------------------------------------------------------

sub get_story_page {
  my $url = shift;
  my $baseurl = shift;
  my $is_dynamic_html = shift;
  my @turnoverlinks;
  my $headline;

  &verbose ("Reading: $url");
  &check_for_oldest ($url);	# we came across the link, so keep it around

  my $cachedpage = undef;
  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $cachedpage = &get_cached_page_for_diff ($url);
  } elsif ($is_dynamic_html) {
    $cachedpage = &get_cached_page ($url);
  }
  if (defined $cachedpage) {
    $cachedpage = &strip_story ($url, $baseurl, $cachedpage, " (cached)");
  }

  my $origpage = &get_page ($url, $is_dynamic_html);
  return unless defined $origpage;
  if ($got_intr_flag) { return; }

  # get headline before stripping StoryStart and StoryEnd
  $headline = &get_headline ($url, $baseurl, $origpage);
  &journal ("pre_strip_story", $origpage);
  my $page = &strip_story ($url, $baseurl, $origpage, "");
  &journal ("post_strip_story", $page);

  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $page = &get_new_bits ($cachedpage, $page);
    &cache_page_later ($url, $origpage);
  } else {
    &cache_page ($url, $origpage);
  }

  if ($got_intr_flag) { return; }

  if (defined fileno JOURNAL) {
    # always write a text-mode version for the journal
    &journal ("to_text_story",
    	&html_to_text ($url, $baseurl, $page, $OUT_TEXT));
  }

  # get turn-over links after stripping StoryStart and StoryEnd
  @turnoverlinks = &get_story_turnover_links ($url, $baseurl, $page);
  $main::warn_about_external_links = 1;
  $page = &html_to_text ($url, $baseurl, $page, $main::outstyle);
  $main::warn_about_external_links = 0;

  if ($is_dynamic_html && defined $cachedpage && !$main::refresh) {
    # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
    # HTML. Sometimes the ad banners will be the only things that have
    # changed between retrieves, and html_to_text will have stripped those
    # out.
    $cachedpage = &html_to_text ($url, $baseurl,
    			$cachedpage, $main::outstyle);
    if (&text_equals ($page, $cachedpage)) {
      &verbose ("Skipping (text has not changed): $url");
      return;
    }
  }

  if (defined $last_modtime{$url} && defined $story_lifetime{$baseurl}
  	&& $last_modtime{$url}
  	< time - $story_lifetime{$baseurl} * 24 * 60 * 60)
  {
    &verbose ("Skipping (story is older than ".
    		$story_lifetime{$baseurl}." days): $url");
    return;
  }

  # ensure there's some alphanumerics in the output text. No alnums means
  # no output. HTML needs to be checked to ensure we don't just pick
  # up tags, which will not be displayed.
  if ((&writing_html && $page !~ /[A-Za-z0-9]\s*</ &&
    		$page !~ />\s*[A-Za-z0-9]/ && $page !~ /^\s*[A-Za-z0-9]/)
    || (!&writing_html && $page !~ /[A-Za-z0-9]/))
  {
    &verbose ("Skipping (no text to write): $url");
    return;
  }

  if ($levels{$baseurl} < 0) {
    # this is a one-level site: therefore the story should be treated
    # as the "front page". Thx Carsten for this one.
    &write_as_story (1, $url, $baseurl, $page, $headline);
  } else {
    &write_as_story (0, $url, $baseurl, $page, $headline);
  }

  &journal ("turnover_links_story", join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($got_intr_flag) { return; }
      $link = &AbsoluteURL ($url, $link);
      &download_story_page ($link, $baseurl, 0);	# right now
    }
  }
}

# ---------------------------------------------------------------------------

sub apply_url_postproc {
  local ($_) = shift;
  my $baseurl = shift;

  if (defined $url_postproc{$baseurl}) {
    if (!eval $url_postproc{$baseurl}."; 1;") {
      &sitewarn("URLProcess failed: $@");
      undef $_;
    }
  }
  $_;
}

# ---------------------------------------------------------------------------

sub clean_pre_tags_for_diff {
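  # Protect the newlines inside one <pre>...</pre> block by swapping them
  # (and the tags themselves) for placeholder markers, so that the
  # whitespace canonicalisation in get_new_bits() leaves preformatted text
  # alone; get_new_bits() swaps the markers back afterwards.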
  my $file = shift;
  my $pre_nl_tag = shift;
  my $pre_pre_tag = shift;
  my $pre_slashpre_tag = shift;

  my $start = '';
  my $end = '';

  ($file =~ s/^(.*)<pre>//i) and $start = $1;
  ($file =~ s/<\/pre>(.*)$//i) and $end = $1;
  $file =~ s/\n/${pre_nl_tag}/gs;

  $start.$pre_pre_tag.$file.$pre_slashpre_tag.$end;
}

sub get_new_bits {
  local ($_);
  my ($oldfile, $newfile) = @_;

  if (!defined $oldfile || $oldfile =~ /^\s*$/) {
    if (!$main::debugdiffs) { return $newfile; }
    $oldfile = '';
  }

  if ($main::refresh) {
    &verbose ("-refresh is on, not looking for differences");
    return $newfile;
  }

  &verbose ("Finding differences between current page and cached version");

  # it's important to keep these names 8.3 for Windows-95 compatibility,
  # as some Windoze diffs may not be able to handle them otherwise!
  # This also requires that we are chdir'd into the temporary directory
  # to avoid hassles with long filenames in the args when we run the
  # diff command. What a pain!
  #
  my $oldf = "a$$.tmp";		# we are already chdir'ed
  my $newf = "b$$.tmp";

  if ($main::debugdiffs) {
    $oldf = "diff_old.tmp";
    $newf = "diff_new.tmp";
  }
  open (F1, "> $oldf") || warn "cannot write to $oldf\n";
  open (F2, "> $newf") || warn "cannot write to $newf\n";

  # Split the file lines at probable story-header endpoints.
  # This makes them more amenable to diffing, hopefully without
  # losing bits we don't want to lose, or gaining bits we don't
  # want to gain. Also try to keep cross-line-split HTML tags
  # together.
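  #
  # For example (illustrative only), a fragment like
  #   <td><p>Old headline</p><p>New headline</p></td>
  # ends up with each <p>...</p> on its own line, so the line-oriented
  # diff below reports only the newly-added paragraphs.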

  # preserve newlines in <pre> text
  my $cleaned_pre_nls = 0;
  my $pre_nl_tag = "<!!!n>";
  my $pre_pre_tag = "<!!!pre>";
  my $pre_slashpre_tag = "<!!!/pre>";

  while ($oldfile =~ /<pre>/i) {
    $oldfile = &clean_pre_tags_for_diff ($oldfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  while ($newfile =~ /<pre>/i) {
    $newfile = &clean_pre_tags_for_diff ($newfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  # canonicalise all other newlines (we control the vertical!)
  $oldfile =~ s/\s*[\r\n]+\s*/ /gs;
  $newfile =~ s/\s*[\r\n]+\s*/ /gs;

  # remove extraneous whitespace from inside tags
  $oldfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;
  $newfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;

  # handle the two types of <p> tags -- <p>...</p>, and just ...<p>
  $oldfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;
  $newfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;

  $oldfile =~ s/(<p *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<p *[^>]*>)/$1\n/gi;

  $oldfile =~ s/<!!!p/<p/gi;
  $newfile =~ s/<!!!p/<p/gi;

  # put newline before these tags (thx Carsten Clasohm, again!)
  $oldfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  $newfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  # after these ones
  $oldfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;

  # remove newlines inside <a href> tags. Thx to Carsten Clasohm.
  1 while $oldfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;
  1 while $newfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;

  if ($cleaned_pre_nls) {
    $oldfile =~ s/${pre_nl_tag}/\n/g; $oldfile =~ s/${pre_pre_tag}/<pre>/g;
    $oldfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
    $newfile =~ s/${pre_nl_tag}/\n/g; $newfile =~ s/${pre_pre_tag}/<pre>/g;
    $newfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
  }

  print F1 $oldfile; close F1;
  print F2 $newfile; close F2;

  my $page = '';
  open (DIFF, "$diff $oldf $newf |") || warn "cannot run $diff\n";
  while (<DIFF>) {
    /^>/ || next;
    $page .= $';
  }
  close DIFF;		# ignore exit status -- exit 1 only means no diffs.

  if ($main::debugdiffs) {
    open (F1, "> diff_out.tmp"); print F1 $page; close F1;
    warn "$diff $oldf $newf, breaking for debug"; &cleanexit;
  }

  unlink $oldf; unlink $newf;
  $page;
}

# ---------------------------------------------------------------------------

sub text_equals {
  my $t1 = shift;
  my $t2 = shift;
  $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//;
  $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//;
  ($t1 eq $t2);
}

# ---------------------------------------------------------------------------
# Strip a story page from StoryStart to StoryEnd.
# In addition, strip out non-story sidebar table items
# and carriage returns (they confuse plenty of regexps later).
#
sub strip_story {
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;
  my $comment = shift;

  if (!defined $page) { return undef; }

  # ok, now strip the headers and footers
  my $pat = $story_start{$baseurl}; if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("StoryStart pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#^.*?${pat}##gs) {
      if (defined fileno JOURNAL) { &journal ("pre_stripped", $&); }
      $page =~ s#^[^<]*?>##gs;		# strip superfluous ends of tags
      if (defined fileno JOURNAL) { &journal ("pre_stripped", $&); }
    } else {
      &sitewarn("StoryStart pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  $pat = $story_end{$baseurl}; if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("StoryEnd pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#${pat}.*?$##gs) {
      if (defined fileno JOURNAL) { &journal ("post_stripped", $&); }
      $page =~ s#<[^>]*?$##gs;		# strip superfluous starts of tags
      if (defined fileno JOURNAL) { &journal ("post_stripped", $&); }
    } else {
      &sitewarn("StoryEnd pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  # &smart_clean_table only operates on table items with size specifications.
  # TODO -- this does not handle <td width=100>...<td>foo</td><td>bar</td></td>
  #
  $page =~ s/<td\s+([^>]+)>(.*?)<\/td>/
		&smart_clean_table ($baseurl, $1, $2, $baseurl);
	/gies;

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

sub strip_front_page {
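  # Like strip_story(), but for contents pages: cut the page down using the
  # per-level ContentsStart/ContentsEnd patterns ($links_start/$links_end,
  # keyed on "$level $baseurl"), then strip cut-in-half tags, sidebar table
  # cells and carriage returns.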
  my $url = shift;
  my $key = shift;
  my $baseurl = shift;
  my $page = shift;

  if (!defined $page) { return undef; }

  my $pat = $links_start{$key}; if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("ContentsStart pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#^.*?${pat}##gs) ||
	&sitewarn("ContentsStart pattern \"$pat\" not found in page $url\n");
    $page =~ s#^[^<]*?>##gs;		# strip cut-in-half tags
  }
  $pat = $links_end{$key}; if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("ContentsEnd pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#${pat}.*?$##gs) ||
	&sitewarn("ContentsEnd pattern \"$pat\" not found in page $url\n");
    $page =~ s#<[^>]*?$##gs;		# strip cut-in-half tags
  }

  $page =~ s/<td\s+([^>]+)>(.*?)<\/td>/
		&smart_clean_table($baseurl, $1, $2, $key);
	/gies;

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

# ---------------------------------------------------------------------------

sub get_headline {
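  # Pick a headline for the story: the RDF/RSS <title> noted while scanning
  # the contents page, if there is one; otherwise the site's StoryHeadline
  # pattern; falling back to a PointCast-style PCTITLE meta tag.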
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;

  my $headline;

  if (defined $url_title{$url}) {
    $headline = &html_to_text ($url, $baseurl,
    		$url_title{$url}, $OUT_TEXT);
    &main::dbg ("StoryHeadline: (from RDF): $headline");

  } elsif (defined $head_pat{$baseurl}) {
    my $pat = $head_pat{$baseurl};
    if ($page !~ m#${pat}#m) {
      &sitewarn("StoryHeadline pattern \"$pat\" not found in page $url\n");
    } elsif (defined $1) {
      $headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT);
      # &main::dbg ("StoryHeadline: $headline"); # logged later on anyway
    } else {
      &sitewarn("StoryHeadline pattern \"$pat\" does not contain brackets!\n");
    }

  } elsif ($page =~ m#<meta name="PCTITLE" content="(.*)">#mi) {
    # try a fallback: search for PointCast headline tags
    $headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT);
    &main::dbg ("StoryHeadline (default, PointCast): $headline");
  }

  $headline;
}

# ---------------------------------------------------------------------------

sub get_story_turnover_links {
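  # Collect links to the continuation pages of a story: every link on the
  # page if the site sets StoryFollowLinks, otherwise only short
  # "more"/"next"/"x of y" style links that point back into the same site.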
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;

  my @turnoverlinks = ();

  while ($page =~ s,<a\s+[^>]*href=(?:\"|%22)?([^>]+)(?:\"|%22)?>(.+?)</a>,,i)
  {
    my $link = $1;
    my $txt = $2;

    if (defined $story_follow_links {$baseurl} &&
      		$story_follow_links {$baseurl})
    {
      push (@turnoverlinks, $link);

    } elsif ($txt =~ m,(more|next|\d+ of \d+|&gt;&gt;),i) {
      my $urlguts = '.';
      ($baseurl =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and
	  ($urlguts = $1);

      if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 5 or more words
	  && (length ($txt) < 15)
	  && $link =~ m/$urlguts/)
      {
	push (@turnoverlinks, $link);
	$txt =~ s/[\n\r]+/ /g;
	&verbose ("(Following 'next page' link: \"$txt\")");
      }
    }
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub get_contents_turnover_links {
  my $url = shift;
  my $key = shift;
  my $page = shift;

  if (!defined $links_follow_links{$key} || !$links_follow_links{$key}) {
    return ();
  }

  my @turnoverlinks = ();

  while ($page =~ s,<a\s+[^>]*href=(?:\"|%22)?([^>]+)(?:\"|%22)?>(.+?)</a>,,i)
  {
    my $link = $1;
    my $txt = $2;

    if (defined $links_follow_links {$key} &&
      		$links_follow_links {$key})
    {
      push (@turnoverlinks, $link);
    }
    # we don't do the automatic "more/next/page x of y" stuff
    # that we do with the story pages
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub remove_an_ext_link {
  my ($link, $text, $ahref, $posthref) = @_;

  if (!&writing_html) {
    return $text;
  }

  if (defined ($main::output_links_snarfed{$link}) || $main::nolinkrewrite) {
    $ahref.$link.$posthref.$text."</a>";
  } else {
    &dbg ("Removing non-snarfed link: $link");
    $text;		# without <a href=...> </a>
  }
}

sub remove_external_links {
  local ($_) = $_[0];

  # &dbg (join(' ', sort keys %main::output_links_snarfed));

  s/(<a\s+[^>]*href=\")([^\"]+)(\"[^>]*?>)(.*?)<\/a>/
	  &remove_an_ext_link ($2, $4, $1, $3);
      /gies;
  $_;
}

# We could do this smarter, but it looks really gross when converted to
# DOC format -- and this tool is primarily for that conversion. Sorry!
# This also works well for iSilo, because iSilo's rendering of <pre> text
# is pretty rotten.
#
sub clean_preformatted_text {
  my $txt = shift;
  $txt =~ s/[ \t]+\n/\n/g;
  $txt =~ s/<(|\/)(pre|code)>//g;	# strip extra <pre> tags!

  # convert blank lines to a paragraph separator.
  $txt =~ s/\n{1,}\n/<p>\n\n/g;

  # The idea with this one is to add a <br> at the end of lines shorter
  # than 50 columns, and conversely to allow lines longer than 50 cols to
  # run into the next line as if they were part of a paragraph.  I'm not
  # sure about it, but a lot of <pre> sites are just copies of emails, so
  # it can make them look a lot better, since the Palm's screen is a
  # lot narrower than 80 columns (which is what most <pre> pages aim for).
  # REVISIT - Make this a .site file parameter?
  $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g;
  $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g;

  $txt =~ s/[ \t]+/ /g;
  $txt;
}

# Work out if we should strip table items based on their size -- well,
# their width at least.
#
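# For example (illustrative only): with TableSmarts enabled, a narrow
# sidebar cell such as <td width=120>...</td> is dropped (anything under
# 250 pixels, or under 40%), while wider story cells are kept.
#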
sub smart_clean_table {
  my $baseurl = $_[0];
  my $contents = $_[2];
  my $key = $_[3];

  if ($use_table_smarts{$key}) {
    $_ = " $_[1] "; s/\s+/ /g; s/ = /=/g; s/"//g;

    #my $replace = ' ';
    #if ($main::debug) {
      #$replace = "[table item \&lt;td$_\&gt; omitted]\n";
    #}

    if (/ width=(\d+) /i) {
      if ($1+0 < 250) { return ' '; }
    } elsif (/ width=(\d+)% /i) {
      if ($1+0 < 40) { return ' '; }
    }
  }
  $contents;
}

sub translate_link {
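  # Rewrite an <a href> link found in scooped HTML: if the target will
  # itself be scooped it becomes a relative link to its per-page temp file
  # (or a "#anchor" within the single output file); otherwise the link is
  # reduced to underlined text so unreachable links don't clutter the doc.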
  my ($url, $baseurl, $link, $text, $ahref, $posthref) = @_;

  if (!&writing_html) {
    return $text;
  }
  if (!defined $ahref) { $ahref = "<a href="; }
  if (!defined $posthref) { $posthref = ">"; }

  $link = &AbsoluteURL ($url, $link);

  if ($main::nolinkrewrite) {
    return $ahref."\"".$link."\"".$posthref.$text."</a>";
  }

  # translate to printable version first, in case the StoryURL pattern
  # only covers the printable style.
  $link = &make_printable ($baseurl, $link, 0);

  # Is the link one that we will be downloading? If not, just de-linkify
  # it. 1-level sites never have active links so we can just assume
  # the links should not be links.
  if (!match_url ($link, $story_limit_to{$baseurl}) || $levels{$baseurl} < 0)
  {
    # check the contents/issue levels as well.
    my $ok = 0;

    for ($lev = $levels{$baseurl}; $lev >= 0; $lev--) {
      my $key = "$lev $baseurl";
      if (defined $links_limit_to{$key}) {
	if (!match_url ($link, $links_limit_to{$key})) { $ok = 1; last; }
      }
    }

    if ($ok == 0 && $link ne $baseurl) {
      if ($main::warn_about_external_links) {
	&main::dbg ("External link not translated: $link");
      }
      # return $text;		# without <a href=...> </a>
      return "<u>".$text."</u>";
    }
  }
  #&main::dbg ("Translating link: $link");
 
  # Note that we always put in quotes around the URL.
  # &remove_external_links, which is run later, requires this (and anyway
  # it makes for better HTML).
  #
  if ($main::fileperpage) {
    my ($fname, $relative) = &get_page_tmpfile ($link);
    $ahref."\"".$relative."\"".$posthref.$text."</a>";
  } else {
    my $anchor = $link; $anchor =~ s/[^-_A-Za-z0-9]/_/g;
    $ahref."\"#".$anchor."\"".$posthref.$text."</a>";
  }
}

sub html_to_text {
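  # Convert a scooped (possibly RSS) page into the requested output format:
  # $OUT_HTML keeps cleaned-up HTML with links rewritten via
  # translate_link(), $OUT_DOC turns <a name> anchors into DOC bookmarks,
  # and both $OUT_DOC and $OUT_TEXT strip the remaining tags and decode the
  # common HTML entities.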
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;
  my $format = shift;

  if ((defined $site_format{$baseurl}) && ($site_format{$baseurl} eq 'rss')) {
    # Convert the RSS formatting into a nice display, for the index page.
    $page =~ s,<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>,<h2>$2<\/h2> $1 $3,gis;

    my $link;
    $page =~ s/<link>(.*?)<\/link>/
      $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g;
      $link = &AbsoluteURL($url,$link);
      '(<a href='.$link.'>'.$link.'</a>)';
    /gies;

    $page =~ s,<title>(.*?)<\/title>,<b>$1<\/b> ,gis;
    $page =~ s,<item>,<p>,gis; $page =~ s,<\/item>,<\/p>,gis;

    # the description is converted for RSS 0.91 sites -- the "fat" format
    $page =~ s,<description>(.*?)<\/description>,$1 ,gis;
  }

  # strip tags we know we don't want
  $page =~ s/<form(\s+[^>]+|)>.*?<\/form>/ /gis;
  $page =~ s/<style(\s+[^>]+|)>.*?<\/style>/ /gis;	# not yet
  $page =~ s/<image(\s+[^>]+|)>.*?<\/image>/ /gis;	# RDF tag
  $page =~ s/<channel(\s+[^>]+|)>.*?<\/channel>/ /gis;	# RDF tag
  $page =~ s/<script(\s+[^>]+|)>.*?<\/script>/ /gis;
  $page =~ s/<map(\s+[^>]+|)>.*?<\/map>/ /gis;
  $page =~ s/<head(\s+[^>]+|)>.*?<\/head>/ /gis;
  $page =~ s/<iframe(\s+[^>]+|)>.*?<\/iframe>/ /gis;
  $page =~ s/<ilayer(\s+[^>]+|)>.*?<\/ilayer>/ /gis;
  $page =~ s/<layer(\s+[^>]+|)>.*?<\/layer>/ /gis;
  $page =~ s/<frame(\s+[^>]+|)>/ /gis;
  $page =~ s/<applet(\s+[^>]+|)>.*?<\/applet>/ /gis;
  $page =~ s/<item(\s+[^>]+|)>.*?<\/item>/ /gis;	# some RDF items
  $page =~ s/<link(\s+[^>]+|)>.*?<\/link>/ /gis;	# some RDF items
  $page =~ s/<title(\s+[^>]+|)>.*?<\/title>/ /gis;	# some RDF items
  $page =~ s/<img\s+[^>]*src=[^>]+>/ /gis;		# inline images
  $page =~ s/<!--.*?-->/ /gis;			# MSIE-style comments
  $page =~ s/<!--.*?>/ /gis;			# Netscape-style comments

  # Clean up some mishaps that can occur with diffed pages and
  # multi-line-spanning formatting tags!
  #
  $page =~ s,<(b)>(.*?)<(/b)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(i)>(.*?)<(/i)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(strong)>(.*?)<(/strong)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(small)>(.*?)<(/small)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(big)>(.*?)<(/big)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(a\s+[^>]+)>(.*?)<(/a)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(font\s+[^>]+)>(.*?)<(/font)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(code(?:\s+[^>]+|))>(.*?)<(/code)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(pre(?:\s+[^>]+|))>(.*?)<(/pre)>,<!!!$1>$2<!!!$3>,gis;
  $page =~ s,<(h\d(?:\s+[^>]+|))>(.*?)<(/h\d)>,<!!!$1>$2<!!!$3>,gis;

  # any tags left are stray tags with no opposite tag, so strip 'em.
  $page =~
      s,<(|/)(a|b|i|small|big|strong|font|code|pre|h\d)(?:\s+[^>]+|)>, ,gis;
  $page =~ s,<!!!,<,g;		# and bring it back to normal

  # convert <pre> text to proper HTML, it displays better.
  $page =~ s/<pre>(.*?)<\/pre>/&clean_preformatted_text($1);/gies;
  $page =~ s/<code>(.*?)<\/code>/&clean_preformatted_text($1);/gies;

  # strip all existing line breaks, they will just confuse matters
  # when we convert to text or HTML. It's also easier to do proper diffs
  # when we control the placement of newlines.
  $page =~ s/[\r\n]+/ /gs;

  if ($format == $OUT_DOC) {
    # Create DOC bookmarks at <a name> tags
    # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu>
    # via Christopher Heschong's <chris@screwdriver.net>
    # webpage-to-prc converter. Nice one lads, good trick!
    $page =~ s/<a name.*?>/$main::bookmark_char /gis;
  }

  if ($format == $OUT_HTML) {

    # note the conversion of href= to href!!!=. This stops the second
    # substitution from screwing up the output of the first one!
    $page =~ s/(<a\s+[^>]*href)=(?:\"|%22)([^\">]+)(?:\"|%22)([^>]*?>)(.*?)<\/a>/
	    &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/(<a\s+[^>]*href)=([^>\s\n]+)([^>]*>)(.*?)<\/a>/
	    &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/href!!!=/href=/gis;	# back to normal

    # This works around a bug (I think) in iSilo that makes Wired News
    # indexes look rotten. Shouldn't be harmful anyway.
    $page =~ s/<br>\s*\&nbsp;\s*<br>/<br><br>/gis;

    # since we're rendering to HTML, line breaks are OK. Put them back in!
    $page =~
    s,(<(br|p|hr|table|td|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis;

    # ...but clean up useless tags and whitespace at the start and end
    # of the text.
    $page =~ s,^(<(br|hr|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>|\s+),,gis;
    $page =~ s,^(<(br|p|hr|td|table|tr|h\d|div)\s*[^>]*>|\s+),,gis;
  }

  if ($format == $OUT_DOC || $format == $OUT_TEXT) {
    # We're converting to DOC or text format, so we need to do a lot
    # more work here.

    # a sidebar enclosed by a table? separate it from the rest of the text.
    $page =~ s/<\/tr>/\n\n/gis;
    $page =~ s/<\/table>/\n\n/gis;	# end of <table>
    $page =~ s/<\/pre>/\n\n/gis;	# end of <pre> text
    $page =~ s/<(\/h\d|h\d)(\s+[^>]+|)>/\n\n/gis;	# headings
    $page =~ s/<\/?blockquote(\s+[^>]+|)>/\n\n/gis;	# quotes
    $page =~ s/<hr(\s+[^>]+?|)>/\n\n/gis;	# horiz lines
    $page =~ s/<br(\s+[^>]+?|)>/\n/gis;	# end-of-line markers
    $page =~ s/<li(\s+[^>]+?|)>/\n/gis;	# list items

    $page =~ s/<\/?p(\s+[^>]+?|)>/\n\n/gis;
    # don't worry, multiple blank lines are sorted later

    $page =~ s/<\/td>/ /gis;		# end-of-table-item

    1 while ($page =~ s/<[^>]+?>//gs);	# trim all other tags

    # do a few escapes here -- the commonly used ones, as they are most
    # likely to show up capitalised where they shouldn't be.
    # The spec seems confused on this point.
    $page =~ s/\&nbsp;/ /gi;
    $page =~ s/\&amp;/\&/gi;
    $page =~ s/\&quot;/\"/gi;
    $page =~ s/\&copy;/(c)/gi;
    $page =~ s/\&reg;/(r)/gi;
    $page =~ s/\&lt;/</gi;
    $page =~ s/\&gt;/>/gi;
    $page = &remove_html_escapes ($page);	# sort the lot of 'em out

    # trim multiple (blank) bookmarks
    $page =~ s/($main::bookmark_char\s+){2,}/$main::bookmark_char /gs;
  }

  $page =~ s/[ \t]+/ /g;	# canonicalise down to one space
  $page =~ s/\n /\n/gs;		# leading w/s on each line
  $page =~ s/\n{3,}/\n\n/gs;	# too many blank lines
  $page =~ s/^\s+//gs;		# blank space at start of story
  $page =~ s/\s+$//gs;		# blank space at end of story

  $page;
}

sub remove_html_escapes {
  # Convert special HTML characters
  # This code was shamelessly stolen from
  # http://pilot.screwdriver.net/convert.pl.txt , 
  # Christopher Heschong's <chris@screwdriver.net> webpage-to-pilot
  # conversion tool. (whoops, reinvented the wheel again --j. ;)
  #
  # In turn, he credits the following folks:
  # From: "Yannick Bergeron" <bergery@videotron.ca>
  # And Especially:  Sam Denton <Sam.Denton@maryville.com>
  #
  # Cheers lads!

  my $page = shift;
  my %escapes = (
    # first, the big four HTML escapes
    'quot',       '"',    # quote
    'amp',        '&',    # ampersand
    'lt',         '<',    # less than
    'gt',         '>',    # greater than
    # Sam got most of the following HTML 4.0 names from
    # http://spectra.eng.hawaii.edu/~msmith/ASICs/HTML/Style/allChar.htm
    'emsp',       "\x80", # em space (HTML 2.0)
    'sbquo',      "\x82", # single low-9 (bottom) quotation mark (U+201A)
    'fnof',       "\x83", # Florin or Guilder (currency) (U+0192)
    'bdquo',      "\x84", # double low-9 (bottom) quotation mark (U+201E)
    'hellip',     "\x85", # horizontal ellipsis (U+2026)
    'dagger',     "\x86", # dagger (U+2020)
    'Dagger',     "\x87", # double dagger (U+2021)
    'circ',       "\x88", # modifier letter circumflex accent
    'permil',     "\x89", # per mill sign (U+2030)
    'Scaron',     "\x8A", # latin capital letter S with caron (U+0160)
    'lsaquo',     "\x8B", # left single angle quotation mark (U+2039)
    'OElig',      "\x8C", # latin capital ligature OE (U+0152)
    'diams',      "\x8D", # diamond suit (U+2666)
    'clubs',      "\x8E", # club suit (U+2663)
    'hearts',     "\x8F", # heart suit (U+2665)
    'spades',     "\x90", # spade suit (U+2660)
    'lsquo',      "\x91", # left single quotation mark (U+2018)
    'rsquo',      "\x92", # right single quotation mark (U+2019)
    'ldquo',      "\x93", # left double quotation mark (U+201C)
    'rdquo',      "\x94", # right double quotation mark (U+201D)
    'endash',     "\x96", # dash the width of ensp (Lynx)
    'ndash',      "\x96", # dash the width of ensp (HTML 2.0)
    'emdash',     "\x97", # dash the width of emsp (Lynx)
    'mdash',      "\x97", # dash the width of emsp (HTML 2.0)
    'tilde',      "\x98", # small tilde
    'trade',      "\x99", # trademark sign (HTML 2.0)
    'scaron',     "\x9A", # latin small letter s with caron (U+0161)
    'rsaquo',     "\x9B", # right single angle quotation mark (U+203A)
    'oelig',      "\x9C", # latin small ligature oe (U+0153)
    'Yuml',       "\x9F", # latin capital letter Y with diaeresis (U+0178)
    'ensp',       "\xA0", # en space (HTML 2.0)
    'thinsp',     "\xA0", # thin space (Lynx)
   # from this point on, we're all (but 2) HTML 2.0
    'nbsp',       "\xA0", # non breaking space
    'iexcl',      "\xA1", # inverted exclamation mark
    'cent',       "\xA2", # cent (currency)
    'pound',      "\xA3", # pound sterling (currency)
    'curren',     "\xA4", # general currency sign (currency)
    'yen',        "\xA5", # yen (currency)
    'brkbar',     "\xA6", # broken vertical bar (Lynx)
    'brvbar',     "\xA6", # broken vertical bar
    'sect',       "\xA7", # section sign
    'die',        "\xA8", # spacing dieresis (Lynx)
    'uml',        "\xA8", # spacing dieresis
    'copy',       "\xA9", # copyright sign
    'ordf',       "\xAA", # feminine ordinal indicator
    'laquo',      "\xAB", # angle quotation mark, left
    'not',        "\xAC", # negation sign
    'shy',        "\xAD", # soft hyphen
    'reg',        "\xAE", # circled R registered sign
    'hibar',      "\xAF", # spacing macron (Lynx)
    'macr',       "\xAF", # spacing macron
    'deg',        "\xB0", # degree sign
    'plusmn',     "\xB1", # plus-or-minus sign
    'sup2',       "\xB2", # superscript 2
    'sup3',       "\xB3", # superscript 3
    'acute',      "\xB4", # spacing acute
    'micro',      "\xB5", # micro sign
    'para',       "\xB6", # paragraph sign
    'middot',     "\xB7", # middle dot
    'cedil',      "\xB8", # spacing cedilla
    'sup1',       "\xB9", # superscript 1
    'ordm',       "\xBA", # masculine ordinal indicator
    'raquo',      "\xBB", # angle quotation mark, right
    'frac14',     "\xBC", # fraction 1/4
    'frac12',     "\xBD", # fraction 1/2
    'frac34',     "\xBE", # fraction 3/4
    'iquest',     "\xBF", # inverted question mark
    'Agrave',     "\xC0", # capital A, grave accent
    'Aacute',     "\xC1", # capital A, acute accent
    'Acirc',      "\xC2", # capital A, circumflex accent
    'Atilde',     "\xC3", # capital A, tilde
    'Auml',       "\xC4", # capital A, dieresis or umlaut mark
    'Aring',      "\xC5", # capital A, ring
    'AElig',      "\xC6", # capital AE diphthong (ligature)
    'Ccedil',     "\xC7", # capital C, cedilla
    'Egrave',     "\xC8", # capital E, grave accent
    'Eacute',     "\xC9", # capital E, acute accent
    'Ecirc',      "\xCA", # capital E, circumflex accent
    'Euml',       "\xCB", # capital E, dieresis or umlaut mark
    'Igrave',     "\xCC", # capital I, grave accent
    'Iacute',     "\xCD", # capital I, acute accent
    'Icirc',      "\xCE", # capital I, circumflex accent
    'Iuml',       "\xCF", # capital I, dieresis or umlaut mark
    'Dstrok',     "\xD0", # capital Eth, Icelandic (Lynx)
    'ETH',        "\xD0", # capital Eth, Icelandic
    'Ntilde',     "\xD1", # capital N, tilde
    'Ograve',     "\xD2", # capital O, grave accent
    'Oacute',     "\xD3", # capital O, acute accent
    'Ocirc',      "\xD4", # capital O, circumflex accent
    'Otilde',     "\xD5", # capital O, tilde
    'Ouml',       "\xD6", # capital O, dieresis or umlaut mark
    'times',      "\xD7", # multiplication sign
    'Oslash',     "\xD8", # capital O, slash
    'Ugrave',     "\xD9", # capital U, grave accent
    'Uacute',     "\xDA", # capital U, acute accent
    'Ucirc',      "\xDB", # capital U, circumflex accent
    'Uuml',       "\xDC", # capital U, dieresis or umlaut mark
    'Yacute',     "\xDD", # capital Y, acute accent
    'THORN',      "\xDE", # capital THORN, Icelandic
    'szlig',      "\xDF", # small sharp s, German (sz ligature)
    'agrave',     "\xE0", # small a, grave accent
    'aacute',     "\xE1", # small a, acute accent
    'acirc',      "\xE2", # small a, circumflex accent
    'atilde',     "\xE3", # small a, tilde
    'auml',       "\xE4", # small a, dieresis or umlaut mark
    'aring',      "\xE5", # small a, ring
    'aelig',      "\xE6", # small ae diphthong (ligature)
    'ccedil',     "\xE7", # small c, cedilla
    'egrave',     "\xE8", # small e, grave accent
    'eacute',     "\xE9", # small e, acute accent
    'ecirc',      "\xEA", # small e, circumflex accent
    'euml',       "\xEB", # small e, dieresis or umlaut mark
    'igrave',     "\xEC", # small i, grave accent
    'iacute',     "\xED", # small i, acute accent
    'icirc',      "\xEE", # small i, circumflex accent
    'iuml',       "\xEF", # small i, dieresis or umlaut mark
    'dstrok',     "\xF0", # small eth, Icelandic (Lynx)
    'eth',        "\xF0", # small eth, Icelandic
    'ntilde',     "\xF1", # small n, tilde
    'ograve',     "\xF2", # small o, grave accent
    'oacute',     "\xF3", # small o, acute accent
    'ocirc',      "\xF4", # small o, circumflex accent
    'otilde',     "\xF5", # small o, tilde
    'ouml',       "\xF6", # small o, dieresis or umlaut mark
    'divide',     "\xF7", # division sign
    'oslash',     "\xF8", # small o, slash
    'ugrave',     "\xF9", # small u, grave accent
    'uacute',     "\xFA", # small u, acute accent
    'ucirc',      "\xFB", # small u, circumflex accent
    'uuml',       "\xFC", # small u, dieresis or umlaut mark
    'yacute',     "\xFD", # small y, acute accent
    'thorn',      "\xFE", # small thorn, Icelandic
    'yuml',       "\xFF", # small y, dieresis or umlaut mark
  );
  foreach $_ (32..126, 160..255) { $escapes{'#'.$_} = pack('c',$_); }
 
  $page =~ s/&((\w*)|(\#\d+));/$escapes{$1} || $&/egi;
  $page;
}

# ---------------------------------------------------------------------------

sub check_for_oldest {
  my $url = shift;
  $url =~ m,http://(\S+?)/,; my $urlhost = $1;

  #&main::dbg ("checking to see if $url is oldest at its site: modtime=".
  	#(defined $last_modtime{$url} ? $last_modtime{$url} : "unknown)"));
  if (defined $last_modtime{$url}) {
    if (defined($main::oldest_already_seen_this_site{$urlhost})
    	? $main::oldest_already_seen_this_site{$urlhost} > $last_modtime{$url} : 1)
    {
      &main::dbg ("oldest link seen at $urlhost $url: modtime=".$last_modtime{$url});
      $main::oldest_already_seen_this_site{$urlhost} = $last_modtime{$url};
    }
  }
}

# ---------------------------------------------------------------------------
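# Cache filenames are simply the URL with every non-alphanumeric character
# mapped to "_", placed under the per-run cache dir, the shared cache dir,
# or the "new" (being-written) cache dir respectively.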

sub cachefilename {
  my $url = shift;

  my $cachefile;
  undef $cachefile;
  if (defined $cachedir) {
    $cachefile = $url;
    $cachefile =~ s/[^-_A-Za-z0-9]/_/g;
    $cachefile = $cachedir.$slash.$cachefile;
  }
}

sub sharedcachefilename {
  my $url = shift;

  my $cachefile;
  undef $cachefile;
  if (defined $sharedcache) {
    $cachefile = $url;
    $cachefile =~ s/[^-_A-Za-z0-9]/_/g;
    $cachefile = $sharedcache.$slash.$cachefile;
  }
}

sub newcachefilename {
  my $url = shift;

  my $cachefile;
  undef $cachefile;
  if (defined $newcachedir) {
    $cachefile = $url;
    $cachefile =~ s/[^-_A-Za-z0-9]/_/g;
    $cachefile = $newcachedir.$slash.$cachefile;
  }
}

sub get_cached_page {
  my $url = shift;
  my $is_diff_page = shift;
  my $cachefile = &cachefilename ($url);

  if (!defined $cachefile) { return undef; }

  # if -refresh is on, do not return any cached pages.
  if ($main::refresh) { return undef; }

  if (open (IN, "< $cachefile")) {
    binmode IN; my $cachedpage = join ('', <IN>); close IN; $cachedpage;
  } else {
    undef;
  }
}
sub get_cached_page_for_diff { &get_cached_page (@_, 1); }

sub get_page {
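  # Fetch a page: prefer the per-run cache, then the shared cache, and only
  # then do an HTTP GET (with an alarm()-based timeout on UNIX).  Returns
  # undef on failure or interruption, or when the page has not changed
  # since the last run.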
  my $url = shift;
  my $is_dynamic_html = shift;
  my $page = '';

  my $cachefile = &cachefilename ($url);
  my $cachedpage = &get_cached_page ($url);
  &check_for_oldest ($url);
  my $lastmod;

  if (defined $cachefile && defined $cachedpage) {
    if ($is_dynamic_html == 0) {
      &main::dbg("cached version exists");
      return $cachedpage;

    } elsif (defined (-M $cachefile)
    	&& -M _ < $main::cached_front_page_lifetime
	&& -M _ > 0)		# just make sure the clock is sane
    {
      &main::dbg("cached version is new enough: ".(-M $cachefile)." days");
      return $cachedpage;

    } elsif ($main::use_only_cache) {
      &main::dbg("-fromcache switch is on, using cached version");
      return $cachedpage;
    }
  }

  # see if we have it in the shared cache
  if (defined $sharedcache) {
    $cachedpage = undef;
    $cachefile = &sharedcachefilename ($url);
    if (defined $cachefile && (open (IN, "< $cachefile"))) {
      binmode IN; $cachedpage = join ("", <IN>); close IN;
      if ($cachedpage =~ s/^<!-- lastmod: (\d+) -->//) {
	$lastmod = $1+0;
      }
    }

    if (defined $cachefile && defined $cachedpage) {
      if ($is_dynamic_html == 0) {
	&main::dbg("shared-cache version exists");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;

      } elsif (defined (-M $cachefile)
	  && -M _ < $main::cached_front_page_lifetime && -M _ > 0)
      {
	&main::dbg("shared-cache version is new enough: ".(-M $cachefile)." days");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;

      } elsif ($main::use_only_cache) {
	&main::dbg("-fromcache switch is on, using shared-cache version");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;
      }
    }

    undef $cachedpage;	# if it didn't pass those tests, don't keep it!
  }

  if ($main::use_only_cache) {
    &main::dbg("-fromcache switch is on, not doing HTTP request");
    return undef;
  }

  if ($got_intr_flag) { return undef; }
  use HTTP::Request::Common;
  my $req = new HTTP::Request ('GET', $url);	# REVISIT - support POST
  $req->header ("Accept-Language" => "en",
	"Accept-Charset" => "iso-8859-1,*,utf-8");

  my $resp = undef;
  $cmd = '$resp = $main::useragent->request ($req);';

  my $timeout = 10;	# minutes

  # REVISIT -- implement timeout for Win32 perl
  if (&Portability::MyOS eq 'UNIX') {
    eval '
      local $SIG{"ALRM"} = sub { die "alarm\n" };
      alarm $timeout*60; { ' . $cmd. ' } alarm 0;
    ';
  } else {
    eval $cmd;
  }

  die if $@ && $@ ne "alarm\n";
  if ($@) {
    &sitewarn ("HTTP GET timed out, $timeout minutes without a response.");
    &got_intr;
  }
  if ($got_intr_flag) { return undef; }

  if (!$resp->is_success) {
    &sitewarn ("HTTP GET failed: ".$resp->status_line." ($url)");
    return undef;
  }
  $page = $resp->content;

  # REVISIT - use $resp->base as new base url?

  if (defined $resp->last_modified) {
    # this seems to be obsolete -- last-modified dates are now being
    # returned as UNIX time_t's. Nice one!
    #
    #use HTTP::Date;
    #$lastmod = undef;
    # protect against a nasty die in Time::Local::timegm().
    #my $x = $SIG{__DIE__}; $SIG{__DIE__} = 'warn';
    #eval {
      #$lastmod = str2time ($resp->last_modified);
    #};
    #$SIG{__DIE__} = $x;		# a bit absurd all that, really

    $lastmod = $resp->last_modified;
    &main::dbg ("last-modified time for $url: $lastmod (".&time2datestr($lastmod).")");

    if (defined $last_modtime{$url} && defined($lastmod)
      && $lastmod <= $last_modtime{$url} && !$main::refresh)
    {
      &verbose ("Skipping (no mod since last download): $url");
      $last_modtime{$url} = $lastmod+0; &check_for_oldest ($url);
      return undef;
    }

  } else {
    &main::dbg ("last-modified time for $url: not provided");
    $lastmod = time;
  }
  $last_modtime{$url} = $lastmod; &check_for_oldest ($url);

  if ($is_dynamic_html && defined $cachedpage && $cachedpage eq $page
    	&& !$main::refresh)
  {
    &verbose ("Skipping (HTML has not changed): $url");
    return undef;
  }
  $page;
}

# ---------------------------------------------------------------------------
 
sub cache_page {
  my ($url, $page, $cachelater) = @_;

  my $cachefile = &newcachefilename ($url);

  # if this page is the latest version of a diffed page, don't cache it
  # immediately, as it will mean lost stories if we're interrupted.
  # Instead save the filename for renaming when the run finishes.
  if (defined $cachelater && $cachelater == 1) {
    my $tmpname = $cachefile.'.tmp';
    $main::caches_to_rename{$tmpname} = $cachefile;
    $cachefile = $tmpname;
  }
  open (C1OUT, "> $cachefile"); binmode C1OUT; print C1OUT $page; close C1OUT;

  if (defined $sharedcache) {
    $cachefile = &sharedcachefilename ($url);
    open (C2OUT, "> $cachefile"); binmode C2OUT;
    if (defined $last_modtime {$url}) {
      # cache the last-modified time of this page as well.
      print C2OUT "<!-- lastmod: ",$last_modtime{$url}," -->\n";
    }
    print C2OUT $page; close C2OUT;
  }
  $page;
}
 
sub cache_page_later { &cache_page ($_[0], $_[1], 1); }

# ---------------------------------------------------------------------------

sub write_as_story {
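  # Format one story (or a contents page, when $is_front is set) and append
  # it to the output: run any StoryPostProc first, add navigation anchors or
  # DOC bookmarks, then either add the text to the in-memory output file or
  # write a per-page temp file, keeping track of the overall size limit.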
  local ($_);
  my ($is_front, $url, $baseurl, $page, $headline) = @_;

  my $sitename = $name{$baseurl};
  if (defined $story_postproc{$baseurl}) {
    my $bookmark_char = $main::bookmark_char;	# convenience for PostProc
    $_ = $page;
    if (!eval $story_postproc{$baseurl}."; 1;") {
      &sitewarn("StoryPostProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }

  my $outtext = '';
  my $one_page_anchor;

  if (&writing_html) {
    $one_page_anchor = $url; $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g;
    $outtext .=
	"<hr><i>$sitename: $url</i><br><a name=\"$one_page_anchor\">\n"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($main::current_story_index-1)."\">&lt;&lt;</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($main::current_story_index+1)."\">&gt;&gt;</a>]<br>"
	. $page;

  } else {
    $outtext .= "------------\n$sitename: $url\n\n";
    if (&writing_doc) {
      if (defined $headline) {
	&verbose ("(Headline: $headline)");
	$outtext .= "$main::bookmark_char $headline\n";
      } else {
	# use the first line in the story instead
	$outtext .= "$main::bookmark_char ";
      }
    }

    foreach $_ (split (/\n/, $page)) {
      if (&writing_text) {
	# wrap each line after 70 columns
	while (s/^(.{70}\S*)\s+//) {
	  $outtext .= $1."\n";
	}
      }
      $outtext .= $_."\n";
    }

    $outtext .= "\n\n\n";
  }

  if ($main::fileperpage) {
    if ($is_front) {
      # this is the front page, just append it to the index file
      $main::output_file .= $outtext;

    } else {
      my ($fname, $relative) = &get_page_tmpfile ($url);
      open (PAGEFILE, ">> $fname") or die "failed to write to $fname\n";
      if (&writing_html) {
	print PAGEFILE "<html><head></head><body>",
			  $outtext, "</body></html>";

	# ensure we have links to the stories from the index page!
	if ($main::output_file !~ /href=\"?$relative/) {
	  if (!defined $headline) { $headline = '(no headline found)'; }
	  $main::output_file .= "<a href=$relative>$headline</a><br>\n";
	}
	$main::output_links_snarfed{$relative} = 1;
	$main::output_story_urls[$main::current_story_index] = $relative;

      } else {
	print PAGEFILE $outtext;
      }
      close PAGEFILE;
    }

  } else {
    $main::output_file .= $outtext;
    $main::output_links_snarfed{'#'.$one_page_anchor} = 1;
    $main::output_story_urls[$main::current_story_index] =
    					'#'.$one_page_anchor;
  }

  $main::current_story_index++;
  $main::output_links_snarfed{$url} = 1;

  $file_size += length($outtext);
  &dbg ("story written, ". ($file_size/1024)." Kb, limit is ".
  			$main::filesizelimit." Kb");

  if ($file_size/1024 >= $main::filesizelimit) {
    $hit_file_size_limit = 1;
  }
  $stories_found++;
}

# ---------------------------------------------------------------------------

sub get_page_tmpfile {
  local ($_);
  my $url = shift;

  if (!defined $page_to_tmpfile{$url}) {
    $_ = $url;
    s/[^-_A-Za-z0-9]+/_/g;
    $page_to_tmpfile{$url} = $tmpfile.$slash.$_;
    while (-r $page_to_tmpfile{$url}) {
      $page_to_tmpfile{$url} .= (time % 10);
    }
    $page_to_tmpfile{$url} .= '.html';
  }

  &main::dbg ("page file for $url: $page_to_tmpfile{$url}");
  $page_to_tmpfile{$url} =~ /[\\\/\:]([-_\.A-Za-z0-9]+)$/;
  ($page_to_tmpfile{$url}, $1);
}

sub clear_page_tmpfiles {
  %page_to_tmpfile = ();
}

# ---------------------------------------------------------------------------

sub warn_log {
  my $msg = join ('', @_); chomp $msg;
  &log ("Warning: ", $msg);
  print STDERR @_;
}

sub die_log {
  my $msg = join ('', @_); chomp $msg;
  &log ("Fatal: ", $msg);
  print STDERR @_; &cleanexit(2);
}

sub log {
  if (defined fileno LOGFILE) { print LOGFILE @_, "\n"; }
}

sub journal {
  if (defined fileno JOURNAL) {
    my $tag = shift;
    my $lines = join("", @_); $lines =~ s/^/$tag:\t/gm;
    print JOURNAL $lines, "\n";
  }
}

sub dbg {
  if ($main::debug != 0) {
    my $msg = "debug: ".join ('', @_); chomp $msg; &log ($msg);
    print STDERR "debug: ",@_,"\n";
  }
}

sub sitewarn {
  &sitewarn_file_line ($sitewarn_current_site_line, @_);
}

sub sitewarn_file_line {
  my $fname = shift;
  $fname =~ s,^.*[\/\\:]([^\/\\:]+?):\d+?$,$1,o;
  warn "Site \"$fname\": ".join('', @_)."\n";
}

sub verbose {
  if ($main::verbose) {
    my $msg = join ('', @_); chomp $msg; &log ($msg);
    print STDERR @_,"\n";
  }
}

sub cleanexit {
  $SIG{__WARN__} = '';
  $SIG{__DIE__} = '';
  exit (defined $_[0] ? $_[0] : 0);	# "exit @_" would exit with the arg count, not the value
}

sub AbsoluteURL {
  local ($baseurl, $_) = @_;
  s/^"//; s/"$//;		# trim quotes if necessary
  s/^%22//; s/%22.*?$//;	# trim escaped quotes (!!)

  s/#.*$//g;			# we can't get bits of docs (yet) TODO 

  if (/^[^\/]+:/) {
    if (!/^(http|file):/) {
      # non-HTTP urls get ignored; don't get URI::URL involved, it'll crash
      return $_;
    }
  }

  use URI::URL;
  my $url = new URI::URL ($_, $baseurl);
  $url->abs->as_string;
}

sub AddHostToURL {
  # a simpler form of AbsoluteURL, used for StoryURL lines.
  # this is necessary because the real thing will escape metacharacters
  # which screws up regexp patterns.

  local ($baseurl, $_) = @_;
  s/^"//; s/"$//;	# trim quotes if necessary
  s/#.*$//g;		# we can't get bits of docs (yet)
  $_ = &expand_url_magic ($_);	# allow [[MM]] etc. keywords in these patterns

  if (m,^[^/]+://,) {
    # do nothing, it's fully-qualified
  } elsif (m,^/,) {
    $baseurl =~ m,^([^/]+://[^/]+)/, and ($_ = $1.$_);
  }
  $_;
}

sub mm_to_monthname {
  my @months = qw(x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
  $months[$_[0]];
}

sub get_date {
  my $time = shift;
  my ($x, $mday, $mon, $year);
  ($x,$x,$x,$mday,$mon,$year,$x,$x,$x) =
  		localtime(defined $time ? $time : time);
  $mon++; $year += 1900;
  ($mday, $mon, $year, &mm_to_monthname($mon));
}

sub time2datestr {
  my $time = shift;
  my ($dd, $mm, $year, $mon) = &get_date ($time);
  "$mon $dd $year";
}

sub match_url {
  my $url = shift;
  my $pat = &expand_url_magic (shift);
  ($url =~ m#^${pat}$#);
}

sub expand_url_magic {
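  # Expand the [[...]] date keywords used in URL patterns; for example
  # (illustrative URL only) "http://example.com/[[YYYY]][[MM]][[DD]]/"
  # becomes today's date, and offsets like [[MM-1]] or [[Mon+1]] give the
  # previous or next month.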
  local ($_);
  my $url = shift;
  my ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr);

  if (!defined $match_url_yyyy) {
    ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr)
    				= &get_date;
    $match_url_yy = $match_url_yyyy; $match_url_yy =~ s/^\d\d//; # trim century
    $match_url_mm = "0$match_url_mm" unless ($match_url_mm =~ /^..$/);
    $match_url_dd = "0$match_url_dd" unless ($match_url_dd =~ /^..$/);
  }

  $url =~ s/\[\[YYYY\]\]/${match_url_yyyy}/g;
  $url =~ s/\[\[YY\]\]/${match_url_yy}/g;

  $url =~ s{\[\[MM([\+\-]\d+|)\]\]}{
    &offset_month($match_url_mm, $1);
  }ge;
  $url =~ s{\[\[Mon([\+\-]\d+|)\]\]}{
    &mm_to_monthname (&offset_month($match_url_mm, $1));
  }ge;
  $url =~ s{\[\[mon([\+\-]\d+|)\]\]}{
    $_ = &mm_to_monthname (&offset_month($match_url_mm, $1));
    tr/A-Z/a-z/; $_;
  }ge;
  $url =~ s{\[\[MON([\+\-]\d+|)\]\]}{
    $_ = &mm_to_monthname (&offset_month($match_url_mm, $1));
    tr/a-z/A-Z/; $_;
  }ge;

  $url =~ s/\[\[DD\]\]/${match_url_dd}/g;
  $url;
}

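# offset_month: add a signed offset to a month number, wrapping around the
# year boundary, and return it zero-padded; e.g. offset_month("12", "+2")
# gives "02" and offset_month("01", "-1") gives "12".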
sub offset_month {
  my $mm = shift;
  my $offset = shift;
  if ($offset ne '') { $mm += $offset; }
  if ($mm < 1 || $mm > 12) { $mm = ((($mm-1)+12) % 12)+1; }
  $mm = "0$mm" unless ($mm =~ /^..$/);
  $mm;
}

sub writing_doc {
  ($main::outstyle == $main::OUT_DOC);
}

sub writing_html {
  ($main::outstyle == $main::OUT_HTML);
}

sub writing_text {
  ($main::outstyle == $main::OUT_TEXT);
}

#===========================================================================

package Portability;

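# MyOS: guess the operating-system family and cache the answer; returns one
# of 'Win32', 'VMS', 'Mac', 'OS2' or 'UNIX' (the fallback).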
sub MyOS {
  if (defined ($Portability::MY_OS)) { return $Portability::MY_OS; }

  # FIGURE OUT THE OS WE'RE RUNNING UNDER
  # Some systems support the $^O variable.  If not available then require()
  # the Config library.  [nicked from CGI.pm -- jmason]

  my $os = $^O;
  unless ($os) {
    require Config;
    $os = $Config::Config{'osname'};
  }

  if ($os=~/win/i) {
    $os = 'Win32';
  } elsif ($os=~/vms/i) {
    $os = 'VMS';
  } elsif ($os=~/mac/i) {
    $os = 'Mac';
  } elsif ($os=~/os2/i) {
    $os = 'OS2';
  } else {
    $os = 'UNIX';
  }
  $Portability::MY_OS = $os;
}

1;

#---------------------------------------------------------------------------

package ScoopHTTP::UserAgent;
use LWP::UserAgent;

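# ScoopHTTP::UserAgent subclasses LWP::UserAgent so that get_basic_credentials
# can prompt the user for site passwords and remember them between runs.
# PasswdMask is the XOR key used to obfuscate (not encrypt!) the passwords
# saved in the site_logins file -- as the key string itself admits.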
BEGIN {
  @ISA = qw(LWP::UserAgent);
  @ScoopHTTP::UserAgent::PasswdMask =
  	unpack ("c*", "Ish0ulDReallY#BeDoING|05th1S>wiTh".
			"5omEThInG+STr0NgeR1kNoW}iKNOw!~");
}

sub new {
  my($class) = @_;
  my $self = new LWP::UserAgent;
  $self = bless $self, $class;
  $self;
}

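# get_basic_credentials: called by LWP when a site demands HTTP basic auth.
# Credentials already known for this realm are reused; otherwise the user is
# prompted on the terminal (with echo turned off for the password on UNIX).
# In CGI mode, or when stdin is not a tty, no prompting is done.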
sub get_basic_credentials {
  my ($self, $realm, $uri, $proxy) = @_;

  if (defined $site_logins{$realm} && defined $site_passes{$realm}) {
    &main::verbose ("(using already-set password for $uri $realm)");

  } else {
    warn ("Need a password to access $uri $realm.\n");
    if ($main::cgimode || !-t) { return undef; }

    print STDERR ("Username: ");
    my $user = <STDIN>; chop $user;

    print STDERR ("Password: ");
    (&Portability::MyOS eq 'UNIX') and system ("stty -echo");
    my $pass = <STDIN>; chop $pass;
    (&Portability::MyOS eq 'UNIX') and system ("stty echo"); print STDERR "\n";

    $site_logins{$realm} = $user;
    $site_passes{$realm} = $pass;
  }

  ($site_logins{$realm}, $site_passes{$realm});
}

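# load_logins: read cached credentials from the site_logins file in the
# per-user temporary directory. Each line has the form
#   0###username###masked password bytes###realm
# where the masked password is a space-separated list of byte values, each
# XORed with 0xaa and with the corresponding PasswdMask byte. Note that the
# modulus used when cycling through the mask is $#mask (the last index), so
# the final mask byte is never used; changing this would make existing
# site_logins files unreadable.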
sub load_logins {
  if (%site_logins) { return %site_logins; }	# already loaded

  %site_logins = ();
  %site_passes = ();
  open (IN, '< '.$main::user_tmpdir.$main::slash.'site_logins') or return undef;

  #$site_logins{'tst'} = $site_passes{'tst'} = "jmason"; &save_logins;

  while (<IN>) {
    s/[\r\n]+$//g;
    my ($ver, $user, $pass, $realm) = split (/###/);
    if (defined $realm && $ver+0 == 0) {
      $site_logins{$realm} = $user;

      my @mask = @ScoopHTTP::UserAgent::PasswdMask;
      my @input = split (' ', $pass);
      my $pass_open = '';
      my $i = 0;

      foreach $_ (@input) {
	my $ch = (($_ ^ $mask[$i++ % $#mask]) ^ 0xaa);
	last if ($ch == 0);
	$pass_open .= sprintf ("%c", $ch);
      }

      $site_passes{$realm} = $pass_open;
    }
  }
  close IN;

  #print "[", $site_logins{'tst'}, "][", $site_passes{'tst'}, "]\n"; exit;
}

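# save_logins: write the cached credentials back out, one line per realm.
# The masked password is padded with masked NUL bytes up to a multiple of
# 16 values, so the file does not give away the password length;
# load_logins stops at the first NUL when unmasking.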
sub save_logins {
  if (!%site_logins) { return; }
  my $towrite = '';

  foreach $realm (sort keys %site_logins) {
    my @mask = @ScoopHTTP::UserAgent::PasswdMask;
    my @input = (unpack ("c*", $site_passes{$realm}));
    my $pass_disguised = '';
    my $i = 0;

    foreach $_ (@input) {
      $pass_disguised .= (($_ ^ 0xaa) ^ $mask[$i++ % $#mask]) . " ";
    }
    while ($i < int(($#input / 16) + 1) * 16) {
      $pass_disguised .= ((0 ^ 0xaa) ^ $mask[$i++ % $#mask]) . " ";
    }
    chop $pass_disguised;

    $towrite .= "0###". $site_logins{$realm}. "###". $pass_disguised.
    		"###". $realm. "\n";
  }

  # again, all at once to minimise contention
  open (OUT, '> '.$main::user_tmpdir.$main::slash.'site_logins') or
  	(warn ("failed to write to site_logins file!\n"), return);
  print OUT $towrite;
  close OUT or warn ("failed to write to site_logins file!\n");
}

1;

#---------------------------------------------------------------------------
# 
#CGI package ScoopCGI;
#CGI 
#CGI $cgi_cookie = undef;
#CGI 
#CGI sub set_cookie {
#CGI   my ($userid) = @_;
#CGI   $cgi_cookie = $main::cgi->cookie(-name=>'sitescooper', -value=>"$userid");
#CGI   print $main::cgi->header(-cookie=>$cgi_cookie);
#CGI }
#CGI 
#CGI sub get_cookie {
#CGI   my $cookie = $main::cgi->cookie('sitescooper');
#CGI   return unless defined ($cookie);
#CGI 
#CGI   my ($uid, $x) = split ('#', $cookie);
#CGI   ($uid =~ /(\d+)/) and ($main::userid = $1);
#CGI }
#CGI 
#CGI sub print_input_form {
#CGI   # REVISIT
#CGI }
#CGI 
#CGI sub print_results_links {
#CGI   # REVISIT
#CGI }
#CGI 
#CGI sub get_prc_file {
#CGI   # REVISIT
#CGI }

1;

# TODO:
#
# incorporate support for mjd's diff implementation
# bundle LWP modules in sitescooper.tar.gz
# allow more flexibility for filenames: by day-of-week, with date at end
# finish CGI support
# CGI: finish cookie userid support -- passwords
#
#---------------------------------------------------------------------------
# vim:sw=2:tw=74:


