#!/usr/bin/perl -w
# Author: Rob Park <rbpark@ualberta.ca>
# License: GNU General Public License

# Modified by Anderson Lizardo <lizardo@linuxfromscratch.org>
# 2004-12-29: changed encoding to ISO-8859-1
# 2004-12-12: added BUGS section
# 2004-03-09: fixed news item URL creation

use XML::RSS;
use Getopt::Long;
use Pod::Usage;
use strict;

# Create the RSS 2.0 document object that is filled in and saved further down.
# The direct Class->new call is used instead of the indirect "new Class" form,
# which perl can parse ambiguously.
my $RSS = XML::RSS->new(version => '2.0', encoding => 'ISO-8859-1');

# Default values for the command-line switches; GetOptions stores whatever
# the user supplies into this same hash.
my %args =
(
	"help" => undef,
	"man" => undef,
	"news-file" => "news.html",
	"rss-file" => "feed.rss"
);

# Parse the command line.  GetOptions returns false when it meets an unknown
# or malformed switch; in that case show the usage text and exit with status 2
# instead of carrying on with the defaults.
GetOptions(\%args, 'help|?', 'man', 'news-file=s', 'rss-file=s')
	or pod2usage(2);
pod2usage(1) if $args{help};
pod2usage(-exitstatus => 0, -verbose => 2) if $args{man};

# News items collected while parsing; each element is a hash reference.
my @items;

# Channel-level fields.  The link starts as the site root and is extended
# while the HTML file is parsed.
my %channel =
(
	link => "http://www.linuxfromscratch.org/",
	language => "en-us"
);

# sanitize(STRING)
#
# Return a copy of STRING with the XML-special characters escaped:
#   '&' becomes '&amp;' unless it already starts an entity reference,
#   '<' becomes '&lt;' and '>' becomes '&gt;'.
# An '&' is left alone when it begins a named entity (&amp;), a decimal
# character reference (&#169;) or a hexadecimal one (&#xA9;), so text that is
# already escaped is not escaped a second time.
sub sanitize
{
	my $string = shift;
	# The lookahead must also accept "#digits;" and "#xhex;": \w does not
	# match '#', so without them numeric references would be double-escaped.
	$string =~ s/&(?!(?:\w+|#\d+|#[xX][0-9a-fA-F]+);)/&amp;/g;
	$string =~ s/</&lt;/g;
	$string =~ s/>/&gt;/g;
	return $string;
}

# Open the HTML news file for reading.  The three-argument form with a lexical
# handle keeps a leading '<', '>' or '|' in the file name from being taken as
# an open mode, and the result is checked so that a missing or unreadable file
# stops the script instead of silently producing an empty feed.
#print "Opening $args{'news-file'} for parsing...\n";
my $news_file = $args{'news-file'};
open my $html_fh, '<', $news_file
	or die "Cannot open $news_file for reading: $!\n";

# Parse the HTML file one line at a time.  Every pattern below has to match
# within a single line.
while (my $line = <$html_fh>)
{
	# figure out what page we're working on: the class attribute of the
	# <body> tag is appended to the site link, followed by the file name
	$line =~ m/<body.*class="(.+)">/
		&& ($channel{link} .= "$1/$news_file");

	# extract the title of the page
	$line =~ m/<title>(.+)<\/title>/
		&& ($channel{title} = sanitize($1));

	# extract the description from the meta tags
	$line =~ m/<meta name="description" content="(.+)"\s?\/>/
		&& ($channel{description} = sanitize($1));

	# extract the news items: an <h3> carrying an id and wrapping a link
	if ($line =~ m/<h3 id="([^"]+)"><a[^>]+>(.+)<\/a><\/h3>/)
	{
		my %item;
		$item{link} = "$channel{link}#$1";
		$item{title} = sanitize($2);

		# base URL for relative links: $channel{link} without the news
		# file name on the end of it
		(my $cwdir = $channel{link}) =~ s/\Q$news_file\E\z//;

		# skip two lines down, to the first <p> tag
		my $skipped = <$html_fh>;
		my $description = <$html_fh>;

		# the file may end right after a heading; fall back to an empty
		# description rather than operating on undef
		$description = '' unless defined $description;

		# keep only the (escaped) paragraph content, then make relative
		# href/src targets absolute by prefixing the base URL
		$description =~ s/^.*<p>(.+)<\/p>.*$/sanitize($1)/e;
		$description =~ s/(href|src)="((?!http|ftp|mailto)[^"]+)"/$1="$cwdir$2"/g;
		chomp $description;
		$item{description} = $description;

		# add the newly parsed news item to the list of news items
		push @items, \%item;
	}
}

close $html_fh;

# Pass the channel fields gathered above to the RSS object.
$RSS->channel(%channel);

# Add each parsed news item to the RSS object, walking @items in order.
$RSS->add_item(%$_) foreach @items;

# Write the feed to the file named by the rss-file option.
#print "Saving RSS feed to $args{'rss-file'}...\n";
$RSS->save($args{'rss-file'});

__END__

=head1 NAME

lfs2rss.pl - parse the LFS website and convert it into an RSS feed

=head1 SYNOPSIS

lfs2rss.pl [options]

=head1 OPTIONS

=over 8

=item B<-h, --help>

Print this help message.

=item B<-m, --man>

Output more verbose help in the form of a man page.

=item B<-n, --news-file>

Define the location of the HTML file to parse. Defaults to C<./news.html>.

=item B<-r, --rss-file>

Define the location of the RSS file to output. Defaults to C<./feed.rss>.

=back

=head1 BUGS

Due to an inherent limitation of the lfs2rss.pl script, news items should not
have newlines inside <p>...</p> tags (at least not in the first paragraph).
This limitation could be avoided by using an XML/HTML parser instead of reading
the HTML file line by line.

=head1 REPORTING BUGS

Report bugs to <rbpark@ualberta.ca>.

=cut
