#!/usr/bin/perl -w
# Author: Rob Park
# License: GNU General Public License
# Modified by Anderson Lizardo
# 2004-12-29: changed encoding to ISO-8859-1
# 2004-12-12: added BUGS section
# 2004-03-09: fixed news item URL creation

use strict;
use warnings;

use File::Basename qw(basename);
use Getopt::Long;
use Pod::Usage;
use XML::RSS;

# Root URL of the site; the page's section and file name are appended to it.
my $SITE_URL = 'http://www.linuxfromscratch.org/';

# default arguments for commandline switches
my %args = (
    'help'      => undef,
    'man'       => undef,
    'news-file' => 'news.html',
    'rss-file'  => 'feed.rss',
);

# parse commandline options, display help if needed
GetOptions(\%args, 'help|?', 'man', 'news-file=s', 'rss-file=s')
    or pod2usage(2);
pod2usage(1) if $args{help};
pod2usage(-exitstatus => 0, -verbose => 2) if $args{man};

# sanitize($string)
#
# Returns a copy of $string that is safe to embed in XML text: a bare '&'
# becomes '&amp;', '<' becomes '&lt;' and '>' becomes '&gt;'.  An '&' that
# already starts a named (&amp;), decimal (&#8217;) or hexadecimal (&#x2019;)
# character reference is left untouched so it is not escaped twice.
sub sanitize {
    my ($string) = @_;

    $string =~ s/&(?!(?:\w+|#\d+|#x[0-9A-Fa-f]+);)/&amp;/g;
    $string =~ s/</&lt;/g;
    $string =~ s/>/&gt;/g;

    return $string;
}

my @items;
my %channel = (
    link     => $SITE_URL,
    language => 'en-us',
);

# Only the file name belongs in the public URL, even when --news-file was
# given as a path to a file in some other directory.
my $news_name = basename($args{'news-file'});

# URL of the directory holding the news page; relative href/src values found
# in item descriptions are resolved against it.
my $page_dir_url = $SITE_URL;

# open the HTML file and tell the user what's going on.
#print "Opening $args{'news-file'} for parsing...\n";
open my $html_fh, '<', $args{'news-file'}
    or die "Cannot open $args{'news-file'} for reading: $!\n";

# parse the HTML file
LINE:
while (my $line = <$html_fh>) {

    # figure out what page we're working on
    # NOTE(review): the original pattern for this match was lost (it had been
    # reduced to an empty m//, which re-runs the last successful regex).  The
    # <body id="..."> form below is a reconstruction -- confirm it against the
    # real news.html markup before relying on it.
    if ($line =~ m{<body id="([^"]+)">}) {
        $page_dir_url  = "$SITE_URL$1/";
        $channel{link} = $page_dir_url . $news_name;
    }

    # extract the title of the page
    if ($line =~ m{<title>(.+)</title>}) {
        $channel{title} = sanitize($1);
    }

    # extract the description from the meta tags
    if ($line =~ m{<meta name="description" content="(.+)"\s?/>}) {
        $channel{description} = sanitize($1);
    }

    # extract the news items; every other kind of line is finished here
    my ($anchor, $title)
        = $line =~ m{<h3 id="([^"]+)"><a[^>]+>(.+)</a></h3>}
        or next LINE;

    my %item = (
        link  => "$channel{link}#$anchor",
        title => sanitize($title),
    );

    # skip two lines down, to the first <p> tag: the first read is discarded,
    # the second is the line expected to hold the paragraph
    my $description;
    $description = <$html_fh> for 1 .. 2;

    if (defined $description) {
        # keep only the paragraph body, escaped for XML
        $description =~ s/^.*<p>(.+)<\/p>.*$/sanitize($1)/e;

        # turn relative links and image sources into absolute URLs
        $description
            =~ s/(href|src)="((?!http|ftp|mailto)[^"]+)"/$1="$page_dir_url$2"/g;

        chomp $description;
        $item{description} = $description;
    }
    else {
        # the file ended right after the heading; keep the item, minus text
        warn "$args{'news-file'}: no description found for item '$anchor'\n";
    }

    # add the newly parsed news item to the list of news items
    push @items, \%item;
}

close $html_fh
    or warn "Cannot close $args{'news-file'}: $!\n";

# make a new XML::RSS object
my $rss = XML::RSS->new(version => '2.0', encoding => 'ISO-8859-1');

# insert the channel information into the RSS object
$rss->channel(%channel);

# insert the news items into the RSS object
$rss->add_item(%{$_}) for @items;

# save the RSS to a file and tell the user what's going on.
#print "Saving RSS feed to $args{'rss-file'}...\n";
$rss->save($args{'rss-file'});

__END__

=head1 NAME

lfs2rss.pl - parse the LFS website and convert it into an RSS feed

=head1 SYNOPSIS

lfs2rss.pl [options]

=head1 DESCRIPTION

Reads an HTML news page line by line, takes the channel title from the
C<< <title> >> element and the channel description from the
C<< <meta name="description"> >> element, and turns every
C<< <h3 id="..."><a ...>...</a></h3> >> heading into a feed item whose
description is the paragraph found two lines below the heading.  The result
is saved as an RSS 2.0 file.

=head1 OPTIONS

=over 8

=item B<-h, --help>

Print this help message.

=item B<-m, --man>

Output more verbose help in the form of a man page.

=item B<-n, --news-file>

Define the location of the HTML file to parse. Defaults to C<./news.html>.

=item B<-r, --rss-file>

Define the location of the RSS file to write. Defaults to C<./feed.rss>.

=back

=head1 BUGS

Due to an inherent limitation of the lfs2rss.pl script, news items should not
have newlines inside C<< <p>...</p> >> tags (at least not in the first
paragraph). This limitation can be avoided by using an XML/HTML parser instead
of reading the HTML file line by line.

=head1 REPORTING BUGS

Report bugs to <rbpark@ualberta.ca>.

=cut