#!/usr/bin/perl # manage_news.pl - Script for news management: archiving, last news publishing # Copyright (C) 2003 Anderson Lizardo # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA use strict; use warnings; use File::Path qw(mkpath); use POSIX qw(strftime); use Getopt::Long; use MIME::Parser; use HTML::Entities qw(encode_entities); use HTML::Parser; use URI; use Pod::Usage; ####################### # Global configuration # LFS main URL. All URLs are made relative to this URL, so do not change it # unless the LFS URL has changed my $lfs_url = "http://www.linuxfromscratch.org/"; # Use relative URLs? my $relative_urls = 1; # Publishing my $show_content = 2; # How many items to show fully my $show_link = 5; # How many items to show as links my $summary_length = 100; # Summary text length (used on "text=" attribute) # End of global configuration ############################## my $help = 0; my $man = 0; my $infile = ""; my $archive_under = ""; my $top = ""; my $bottom = ""; GetOptions( "help" => \$help, "man" => \$man, "infile|i=s" => \$infile, "archive-under|a=s" => \$archive_under, "top|t=s" => \$top, "bottom|b=s" => \$bottom, ) or pod2usage(1); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my $parser = new MIME::Parser; # These options disable the use of temporary files and use "in-core" # (memory-only) structures instead. $parser->output_to_core(1); $parser->tmp_to_core(1); $parser->use_inner_files(1); # Text buffer, used to store parsed HTML data my $buffer = ""; my $mime_entity; my $input_name; # Read input file, given with -i option if ($infile) { $input_name = $ENV{"PWD"} . "/" . $infile; check_syntax(); eval { $mime_entity = $parser->parse_open($infile) } or pod2usage("$0\: $@"); } else { pod2usage("$0\: You need to specify a filename with -i option"); } my $section = lc(Get_header($mime_entity->head, "section")); if ($section =~ /^general$/) { $section = "" } else { $section .= "/" }; my $base_url = ""; # Flag to convert relative URLs on news items for archiving my $CONVERT_URLS = 0; if ($archive_under) { # News items' dates (%dates->year->month) my $dates = {}; ($infile =~ /news-(\d{4})\.txt/) && (my $db_year = $1); foreach ($mime_entity->parts) { $dates->{isodate2any($_->head, '%Y')}->{isodate2any($_->head, '%m')} = 1 unless isodate2any($_->head, '%Y') ne $db_year; } my $html_p = HTML::Parser->new(api_version => 3, start_h => [\&handle_StartTag, "tagname, attr"], end_h => [\&handle_EndTag, "tagname, tokenpos"], text_h => [\&handle_Text, "dtext" ], declaration_h => [\&handle_Dec, "text"], ); $html_p->attr_encoded(1); $html_p->xml_mode(1); # Create index.html files foreach my $year (keys %{$dates}) { my $dir = $archive_under . "/" . $section . $year; $base_url = $lfs_url . "news/" . $section . $year . "/"; eval { mkpath($dir) unless -d $dir } or die "$0\: Could not create " . $dir . ": $@\n"; open (INDEX, ">$dir/index.html") or die "$0\: Could not open $dir/index.html: $!\n"; if ($top) { $html_p->parse_file($top) or die "$0\: Could not parse " . $top . ": $!\n"; $html_p->eof; print INDEX $buffer; $buffer = ""; } print INDEX "

$year

\n
\n"; if ($bottom) { $html_p->parse_file($bottom) or die "$0\: Could not parse " . $bottom . ": $!\n"; $html_p->eof; print INDEX $buffer; $buffer = ""; } close INDEX; } # Create news archives foreach my $year (keys %{$dates}) { foreach my $month (keys %{$dates->{$year}}) { my $archive_file = $archive_under . "/" . $section . $year . "/" . $month . ".html"; $base_url = $lfs_url . "news/" . $section . $year . "/"; open(NEWS, ">$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n"; if ($top) { $html_p->parse_file($top) or die "$0\: Could not parse " . $top . ": $!\n"; $html_p->eof; print NEWS $buffer; $buffer = ""; } close NEWS; } } foreach my $part ($mime_entity->parts) { next if isodate2any($part->head, '%Y') ne $db_year; my $archive_file = $archive_under . "/" . $section . isodate2any($part->head, '%Y/%m') . ".html"; $base_url = $lfs_url . "news/" . $section . isodate2any($part->head, '%Y') . "/"; open(NEWS, ">>$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n"; $CONVERT_URLS = 1; $html_p->parse(mime2html($part)); $html_p->eof; $CONVERT_URLS = 0; print NEWS $buffer . "\n\n"; $buffer = ""; close NEWS; } foreach my $year (keys %{$dates}) { foreach my $month (keys %{$dates->{$year}}) { my $archive_file = $archive_under . "/" . $section . $year . "/" . $month . ".html"; $base_url = $lfs_url . "news/" . $section . $year . "/"; open(NEWS, ">>$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n"; if ($bottom) { $html_p->parse_file($bottom) or die "$0\: Could not parse " . $bottom . ": $!\n"; $html_p->eof; print NEWS $buffer; $buffer = ""; } close NEWS; } } } else { print "\n

General news

\n" unless $section; # FIXME Workaround for "no section for general news" problem $base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/"; foreach my $part (splice(@{[$mime_entity->parts]}, 0, $show_content)) { print mime2html($part); } my @show_as_links = (); if (@{[$mime_entity->parts]} >= $show_content + $show_link) { @show_as_links = splice(@{[$mime_entity->parts]}, $show_content, $show_link); } if (@show_as_links) { print "
\n\t

Previous news:

\n\t\n
\n"; } print "
\n

Latest SVN changes:

\n" unless $section; } if ($parser->results->errors) { print STDERR "$0\: $input_name\: " . $_ foreach ($parser->results->errors); } if ($parser->results->warnings) { print STDERR "$0\: $input_name\: " . $_ foreach ($parser->results->warnings); } ########################## # HTML parser subroutines sub handle_StartTag { my ($tag, $attrs) = @_; if ($$attrs{"href"}) { if ($CONVERT_URLS) { my $old_base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/"; $$attrs{"href"} = change_urls($$attrs{"href"}, $old_base_url) unless $$attrs{"href"} =~ /^#/; } else { $$attrs{"href"} = change_urls($$attrs{"href"}) unless $$attrs{"href"} =~ /^#/; } } if ($$attrs{"src"}) { if ($CONVERT_URLS) { my $old_base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/"; $$attrs{"src"} = change_urls($$attrs{"src"}, $old_base_url); } else { $$attrs{"src"} = change_urls($$attrs{"src"}); } } $buffer .= "<$tag"; $buffer .= " $_=\"$$attrs{$_}\"" foreach (keys %$attrs); $buffer .= ">"; } sub handle_EndTag { my ($tag, $tokenpos) = @_; if ($buffer =~ /<$tag[^>]*>$/ and !defined($tokenpos)) { $buffer =~ s/>$/ \/>/; } else { $buffer .= ""; } } sub handle_Text { my ($text) = @_; $buffer .= encode_entities($text); } sub handle_Dec { my ($text) = @_; $buffer .= $text; } ###################### # General subroutines sub Get_header { my ($header, $name) = @_; # Extract the field from the header my $hdr_content = $header->get($name); # Die if the field is not found die "$0\: $input_name\: Could not find header field " . $name . "\n" . "Header contents:\n" . $header->as_string . "\n" unless defined($hdr_content); $hdr_content =~ s/^\s*//; $hdr_content =~ s/\s*$//; chomp($hdr_content); return $hdr_content; } # Convert ISO 8601 date (yyyy/mm/dd) to the specified format sub isodate2any { my ($header, $format) = @_; # Date field must be in YYYY/MM/DD format # This check avoids possible cross-site scripting if (Get_header($header, "date") =~ /^(\d{4})\/(\d{2})\/(\d{2})$/) { return strftime($format, 0, 0, 0, $3, $2 - 1, $1 - 1900); } else { die "$0\: $input_name\: Invalid date: " . Get_header($header, "date") . "\n" . "News header:\n---\n" . $header->as_string . "\n---\n"; } } # Translate MIME data to HTML sub mime2html { my ($mime_part) = @_; my $news_id; if (defined($mime_part->head->get("id"))) { $news_id = lc(Get_header($mime_part->head, "id")); } else { # News item ID, created from the news title $news_id = lc(Get_header($mime_part->head, "title")); $news_id =~ s/\W+//g; } my $item_url = "../news/" . $section . isodate2any($mime_part->head, '%Y/%m') . ".html"; my $title = Get_header($mime_part->head, "title"); my $author = Get_header($mime_part->head, "author"); my $date = Get_header($mime_part->head, "date"); my $content = $mime_part->bodyhandle->as_string; # Return the XHTML code return "\t

$title

\n" . "\t\t

$author - $date

\n$content\n"; } # Translate MIME data to links pointing to news items sub mime2link { my ($mime_part) = @_; my $news_id; if (defined($mime_part->head->get("id"))) { $news_id = lc(Get_header($mime_part->head, "id")); } else { # News item ID, created from the news title $news_id = lc(Get_header($mime_part->head, "title")); $news_id =~ s/\W+//g; } my $item_url = change_urls("/news/" . $section . isodate2any($mime_part->head, '%Y/%m') . ".html"); my $title = Get_header($mime_part->head, "title"); my $summary = truncate_text($mime_part->bodyhandle->as_string, $summary_length); return "$title"; } sub truncate_text { my ($text, $length) = @_; $text =~ s/<[^>]+>//g; $text =~ s/^\s+//; $text =~ s/\n/ /g; $text =~ s/\s{2,}/ /g; return substr(encode_entities($text), 0, $length) . "..."; } # Make URLs relative or absolute (see $relative_urls) sub change_urls { my $url = shift @_; my $old_base_url; ($old_base_url = $base_url) unless ($old_base_url = shift @_); # "scheme" returns undef if $url is relative if ($relative_urls and !defined(URI->new($url)->scheme)) { return URI->new($url)->abs($old_base_url)->rel($base_url); } else { return URI->new($url)->abs($old_base_url); } } { # Number of errors reported my $errors = 0; # Validate MIME database for syntax sub check_syntax { my %tests = (); my $cur_line = 0; my $boundary = ""; # Boundary line number of current item my $cur_item = 0; Error("Invalid database filename. It should have the format news-YEAR.txt") unless $infile =~ /news-\d{4}\.txt/; open(FILE, $infile) or die "Cannot open \"$infile\": $!\n"; while () { $cur_line++; if (/^(\s*)Content-Type:\s+multipart\/mixed;/i) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_content_type} = 1; } if (/^\s*boundary="([^"]+)"/i) { $tests{has_boundary} = 1; $boundary = $1; } if (/^(\s*)MIME-Version:\s+1\.0\s*$/i) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_mime_version} = 1; } if (/^(\s*)Section:\s+[\w-]+\s*$/i) { Error("Invalid space character at start of line $cur_line.") if $1; Error("Invalid section value at line $cur_line.") unless /^Section:\s+[\w-]+\s*$/; $tests{has_section} = 1; } if (/^(\s*)--(\w+)/) { Error("Invalid space character at start of line $cur_line.") if $1; if ($2 ne $boundary and $boundary) { Error("Wrong boundary at line $cur_line; it should be \"$boundary\"."); } elsif ($boundary) { if ($cur_item) { Error("No \"Title\" field found for item at line $cur_item.") unless $tests{has_title}; Error("No \"Author\" field found for item at line $cur_item.") unless $tests{has_author}; Error("No \"Date\" field found for item at line $cur_item.") unless $tests{has_date}; } $cur_item = $cur_line; delete $tests{has_title}; delete $tests{has_author}; delete $tests{has_date}; } } if (/^(\s*)Title:/i) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_title} = 1; } if (/^(\s*)Author:/i) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_author} = 1; } if (/^(\s*)Date:/i) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_date} = 1; } if (/^(\s*)--$boundary--$/) { Error("Invalid space character at start of line $cur_line.") if $1; $tests{has_end_boundary} = 1; } } close FILE; Error("\"Content-Type\" field invalid or not found.") unless $tests{"has_content_type"}; Error("MIME boundary not declared (eg. boundary=\"NEWS_ITEM_BOUNDARY\").") unless $tests{"has_boundary"}; Error("\"MIME-Version\" field invalid or not found.") unless $tests{"has_mime_version"}; Error("\"Section\" field not found.") unless $tests{"has_section"}; Error("MIME end boundary (eg. \"--NEWS_ITEM_BOUNDARY--\") invalid or not found.") unless $tests{"has_end_boundary"}; die "$errors syntax error(s) found on $input_name.\n" if $errors; } sub Error { my ($message) = @_; print STDERR "$0\: $input_name\: $message\n"; $errors++; } } __END__ =head1 NAME manage_news.pl - Script for news management: archiving, last news publishing =head1 SYNOPSIS manage_news.pl [--help|--man] [-t top.html] [-b bottom.html] [-a output_dir] -i mime_db Options: --infile|-i Parse MIME news database from given file --top|-t Prepend top.html to the output --bottom|-b Append bottom.html to the output --archive-under|-a Output news under output_dir/{,section/}YYYY/MM.html --help Show brief help message --man Full documentation =head1 DESCRIPTION B is a script for news management, including archiving and last news publishing. It parses a MIME format news database and extracts news items from it. =head1 OPTIONS =over =item B<--infile mime_db> Specify a MIME news database to parse. =item B<--top top.html, --bottom bottom.html> These options are useful for template insertion. C<--top> prepends the given file to the output, and C<--bottom> appends it. =item B<--archive-under output_dir> Output news under F, where YYYY and MM are numeric values for year and month, respectively. By default, B outputs the five last news to standard output. =item B<--help> Print a brief help message and exits. =item B<--man> Print the manual page and exits. =back =head1 TODO =over =item * Allow date selection by ranges (like "<2003/08/22" or "2003/08/20-2003/08/22") =back =head1 AUTHOR Copyright (C) 2003 Anderson Lizardo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. =cut