#!/usr/bin/perl
# manage_news.pl - Script for news management: archiving, last news publishing

# Copyright (C) 2003 Anderson Lizardo
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA

use strict;
use warnings;

use File::Path qw(mkpath);
use POSIX qw(strftime);
use Getopt::Long;
use MIME::Parser;
use HTML::Entities qw(encode_entities);
use HTML::Parser;
use URI;
use Pod::Usage;

#######################
# Global configuration

# LFS main URL. All URLs are made relative to this URL, so do not change it
# unless the LFS URL has changed
my $lfs_url = "http://www.linuxfromscratch.org/";

# Use relative URLs?
my $relative_urls = 1;

# Publishing
my $show_content = 2;    # How many items to show fully
my $show_link = 5;       # How many items to show as links
my $summary_length = 100; # Summary text length (used on <a> "text=" attribute)

# End of global configuration
##############################

my $help = 0;
my $man = 0;
my $infile = "";
my $archive_under = "";
my $top = "";
my $bottom = "";

GetOptions(
    "help" => \$help,
    "man" => \$man,
    "infile|i=s" => \$infile,
    "archive-under|a=s" => \$archive_under,
    "top|t=s" => \$top,
    "bottom|b=s" => \$bottom,
) or pod2usage(1);

pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

my $parser = new MIME::Parser;

# These options disable the use of temporary files and use "in-core"
# (memory-only) structures instead.
$parser->output_to_core(1);
$parser->tmp_to_core(1);
$parser->use_inner_files(1);

# Text buffer, used to store parsed HTML data
my $buffer = "";

my $mime_entity;
my $input_name;

# Read input file, given with -i option
if ($infile) {
    $input_name = $ENV{"PWD"} . "/" . $infile;
    check_syntax();
    eval { $mime_entity = $parser->parse_open($infile) } or pod2usage("$0\: $@");
}
else {
    pod2usage("$0\: You need to specify a filename with -i option");
}

my $section = lc(Get_header($mime_entity->head, "section"));
if ($section =~ /^general$/) { $section = "" }
else { $section .= "/" };

my $base_url = "";
# Flag to convert relative URLs on news items for archiving
my $CONVERT_URLS = 0;

if ($archive_under) {
    # News items' dates (%dates->year->month)
    my $dates = {};
    ($infile =~ /news-(\d{4})\.txt/) && (my $db_year = $1);
    foreach ($mime_entity->parts) {
        $dates->{isodate2any($_->head, '%Y')}->{isodate2any($_->head, '%m')} = 1
        unless isodate2any($_->head, '%Y') ne $db_year;
    }

    my $html_p = HTML::Parser->new(api_version => 3,
        start_h => [\&handle_StartTag, "tagname, attr"],
        end_h => [\&handle_EndTag, "tagname, tokenpos"],
        text_h => [\&handle_Text, "dtext" ],
        declaration_h => [\&handle_Dec, "text"],
    );
    $html_p->attr_encoded(1);
    $html_p->xml_mode(1);

    # Create index.html files
    foreach my $year (keys %{$dates}) {
        my $dir = $archive_under . "/" . $section . $year;
        $base_url = $lfs_url . "news/" . $section . $year . "/";
        eval { mkpath($dir) unless -d $dir } or die "$0\: Could not create " . $dir . ": $@\n";
        open (INDEX, ">$dir/index.html") or die "$0\: Could not open $dir/index.html: $!\n";
        if ($top) {
            $html_p->parse_file($top) or die "$0\: Could not parse " . $top . ": $!\n";
            $html_p->eof;
            print INDEX $buffer;
            $buffer = "";
        }
        print INDEX "<h3>$year</h3>\n<ul>\n";
        foreach my $month (sort(keys %{$dates->{$year}})) {
            print INDEX "\t<li><a href=\"$month.html\">" . strftime('%B',0,0,0,1,$month - 1,$year - 1900) .
            "</a></li>\n";
        }
        print INDEX "</ul>\n";
        if ($bottom) {
            $html_p->parse_file($bottom) or die "$0\: Could not parse " . $bottom . ": $!\n";
            $html_p->eof;
            print INDEX $buffer;
            $buffer = "";
        }
        close INDEX;
    }
    # Create news archives
    foreach my $year (keys %{$dates}) {
        foreach my $month (keys %{$dates->{$year}}) {
            my $archive_file = $archive_under . "/" . $section . $year . "/" . $month . ".html";
            $base_url = $lfs_url . "news/" . $section . $year . "/";
            open(NEWS, ">$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n";
            if ($top) {
                $html_p->parse_file($top) or die "$0\: Could not parse " . $top . ": $!\n";
                $html_p->eof;
                print NEWS $buffer;
                $buffer = "";
            }
            close NEWS;
        }
    }
    foreach my $part ($mime_entity->parts) {
        next if isodate2any($part->head, '%Y') ne $db_year;
        my $archive_file = $archive_under . "/" . $section . isodate2any($part->head, '%Y/%m') . ".html";
        $base_url = $lfs_url . "news/" . $section . isodate2any($part->head, '%Y') . "/";
        open(NEWS, ">>$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n";
        $CONVERT_URLS = 1;
        $html_p->parse(mime2html($part));
        $html_p->eof;
        $CONVERT_URLS = 0;
        print NEWS $buffer . "\n\n";
        $buffer = "";
        close NEWS;
    }
    foreach my $year (keys %{$dates}) {
        foreach my $month (keys %{$dates->{$year}}) {
            my $archive_file = $archive_under . "/" . $section . $year . "/" . $month . ".html";
            $base_url = $lfs_url . "news/" . $section . $year . "/";
            open(NEWS, ">>$archive_file") or die "$0\: Could not open $archive_file" . ": $!\n";
            if ($bottom) {
                $html_p->parse_file($bottom) or die "$0\: Could not parse " . $bottom . ": $!\n";
                $html_p->eof;
                print NEWS $buffer;
                $buffer = "";
            }
            close NEWS;
        }
    }
}
else {
    print "</div>\n<div id=\"generalnews\"><h2>General news</h2>\n" unless $section;
    # FIXME Workaround for "no section for general news" problem
    $base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/";
    foreach my $part (splice(@{[$mime_entity->parts]}, 0, $show_content)) {
        print mime2html($part);
    }
	my @show_as_links = ();
	if (@{[$mime_entity->parts]} >= $show_content + $show_link) {
		@show_as_links = splice(@{[$mime_entity->parts]}, $show_content, $show_link);
	}
    if (@show_as_links) {
        print "<div class=\"oldnews\">\n\t<h3>Previous news:</h3>\n\t<ul>\n";
        foreach my $part (@show_as_links) {
            print "\t\t<li>" . mime2link($part) . "</li>\n";
        }
        print "\t</ul>\n</div>\n";
    }
    print "</div>\n<div id=\"changelog\"><h2>Latest SVN changes:</h2>\n" unless $section;
}

if ($parser->results->errors) {
    print STDERR "$0\: $input_name\: " . $_ foreach ($parser->results->errors);
}

if ($parser->results->warnings) {
    print STDERR "$0\: $input_name\: " . $_ foreach ($parser->results->warnings);
}

##########################
# HTML parser subroutines

sub handle_StartTag {
    my ($tag, $attrs) = @_;

    if ($$attrs{"href"}) {
        if ($CONVERT_URLS) {
            my $old_base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/";
            $$attrs{"href"} = change_urls($$attrs{"href"}, $old_base_url) unless $$attrs{"href"} =~ /^#/;
        }
        else {
            $$attrs{"href"} = change_urls($$attrs{"href"}) unless $$attrs{"href"} =~ /^#/;
        }
    }
    if ($$attrs{"src"}) {
        if ($CONVERT_URLS) {
            my $old_base_url = $section ? $lfs_url . $section : $lfs_url . "dummy_section/";
            $$attrs{"src"} = change_urls($$attrs{"src"}, $old_base_url);
        }
        else {
            $$attrs{"src"} = change_urls($$attrs{"src"});
        }
    }
    $buffer .= "<$tag";
    $buffer .= " $_=\"$$attrs{$_}\"" foreach (keys %$attrs);
    $buffer .= ">";
}

sub handle_EndTag {
    my ($tag, $tokenpos) = @_;
    
    if ($buffer =~ /<$tag[^>]*>$/ and !defined($tokenpos)) {
        $buffer =~ s/>$/ \/>/;
    }
    else {
        $buffer .= "</$tag>";
    }
}

sub handle_Text {
    my ($text) = @_;
    $buffer .= encode_entities($text);
}

sub handle_Dec {
    my ($text) = @_;
    $buffer .= $text;
}

######################
# General subroutines

sub Get_header {
    my ($header, $name) = @_;
    # Extract the field from the header
    my $hdr_content = $header->get($name);
    # Die if the field is not found
    die "$0\: $input_name\: Could not find header field " . $name . "\n" .
    "Header contents:\n" . $header->as_string . "\n" unless defined($hdr_content);
    $hdr_content =~ s/^\s*//;
    $hdr_content =~ s/\s*$//;
    chomp($hdr_content);
    return $hdr_content;
}

# Convert ISO 8601 date (yyyy/mm/dd) to the specified format
sub isodate2any {
    my ($header, $format) = @_;
    # Date field must be in YYYY/MM/DD format
    # This check avoids possible cross-site scripting
    if (Get_header($header, "date") =~ /^(\d{4})\/(\d{2})\/(\d{2})$/) {
        return strftime($format, 0, 0, 0, $3, $2 - 1, $1 - 1900);
    }
    else {
        die "$0\: $input_name\: Invalid date: " . Get_header($header, "date") . "\n" .
        "News header:\n---\n" . $header->as_string . "\n---\n";
    }
}

# Translate MIME data to HTML
sub mime2html {
    my ($mime_part) = @_;

    my $news_id;
    if (defined($mime_part->head->get("id"))) {
        $news_id = lc(Get_header($mime_part->head, "id"));
    }
    else {
        # News item ID, created from the news title
        $news_id = lc(Get_header($mime_part->head, "title"));
        $news_id =~ s/\W+//g;
    }
    my $item_url = "../news/" . $section . isodate2any($mime_part->head, '%Y/%m') . ".html";
    my $title = Get_header($mime_part->head, "title");
    my $author = Get_header($mime_part->head, "author");
    my $date = Get_header($mime_part->head, "date");
    my $content = $mime_part->bodyhandle->as_string;

    # Return the XHTML code
    return "\t<h3 id=\"$news_id\"><a href=\"$item_url#$news_id\">$title</a></h3>\n" .
    "\t\t<h4>$author - $date</h4>\n$content\n";
}

# Translate MIME data to links pointing to news items
sub mime2link {
    my ($mime_part) = @_;

    my $news_id;
    if (defined($mime_part->head->get("id"))) {
        $news_id = lc(Get_header($mime_part->head, "id"));
    }
    else {
        # News item ID, created from the news title
        $news_id = lc(Get_header($mime_part->head, "title"));
        $news_id =~ s/\W+//g;
    }
    my $item_url = change_urls("/news/" . $section . isodate2any($mime_part->head, '%Y/%m') . ".html");
    my $title = Get_header($mime_part->head, "title");
    my $summary = truncate_text($mime_part->bodyhandle->as_string, $summary_length);

    return "<a href=\"$item_url#$news_id\" title=\"$summary\">$title</a>";
}

sub truncate_text {
    my ($text, $length) = @_;

    $text =~ s/<[^>]+>//g;
    $text =~ s/^\s+//;
    $text =~ s/\n/ /g;
    $text =~ s/\s{2,}/ /g;
    return substr(encode_entities($text), 0, $length) . "...";
}

# Make URLs relative or absolute (see $relative_urls)
sub change_urls {
    my $url = shift @_;
    my $old_base_url;
    ($old_base_url = $base_url) unless ($old_base_url = shift @_);

    # "scheme" returns undef if $url is relative
    if ($relative_urls and !defined(URI->new($url)->scheme)) {
        return URI->new($url)->abs($old_base_url)->rel($base_url);
    }
    else {
        return URI->new($url)->abs($old_base_url);
    }
}

{
# Number of errors reported
my $errors = 0;

# Validate MIME database for syntax
sub check_syntax {

    my %tests = ();
    my $cur_line = 0;
    my $boundary = "";
    # Boundary line number of current item
    my $cur_item = 0;

    Error("Invalid database filename. It should have the format news-YEAR.txt")
    unless $infile =~ /news-\d{4}\.txt/;
    open(FILE, $infile) or die "Cannot open \"$infile\": $!\n";
    while (<FILE>) {
        $cur_line++;
        if (/^(\s*)Content-Type:\s+multipart\/mixed;/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_content_type} = 1;
        }
        if (/^\s*boundary="([^"]+)"/i) {
            $tests{has_boundary} = 1;
            $boundary = $1;
        }
        if (/^(\s*)MIME-Version:\s+1\.0\s*$/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_mime_version} = 1;
        }
        if (/^(\s*)Section:\s+[\w-]+\s*$/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            Error("Invalid section value at line $cur_line.") unless /^Section:\s+[\w-]+\s*$/;
            $tests{has_section} = 1;
        }
        if (/^(\s*)--(\w+)/) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            if ($2 ne $boundary and $boundary) {
                Error("Wrong boundary at line $cur_line; it should be \"$boundary\".");
            }
            elsif ($boundary) {
                if ($cur_item) {
                    Error("No \"Title\" field found for item at line $cur_item.") unless $tests{has_title};
                    Error("No \"Author\" field found for item at line $cur_item.") unless $tests{has_author};
                    Error("No \"Date\" field found for item at line $cur_item.") unless $tests{has_date};
                }
                $cur_item = $cur_line;
                delete $tests{has_title};
                delete $tests{has_author};
                delete $tests{has_date};
            }
        }
        if (/^(\s*)Title:/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_title} = 1;
        }
        if (/^(\s*)Author:/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_author} = 1;
        }
        if (/^(\s*)Date:/i) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_date} = 1;
        }
        if (/^(\s*)--$boundary--$/) {
            Error("Invalid space character at start of line $cur_line.") if $1;
            $tests{has_end_boundary} = 1;
        }
        
    }
    close FILE;
    Error("\"Content-Type\" field invalid or not found.") unless $tests{"has_content_type"};
    Error("MIME boundary not declared (eg. boundary=\"NEWS_ITEM_BOUNDARY\").") unless $tests{"has_boundary"};
    Error("\"MIME-Version\" field invalid or not found.") unless $tests{"has_mime_version"};
    Error("\"Section\" field not found.") unless $tests{"has_section"};
    Error("MIME end boundary (eg. \"--NEWS_ITEM_BOUNDARY--\") invalid or not found.") unless $tests{"has_end_boundary"};
    die "$errors syntax error(s) found on $input_name.\n" if $errors;
}

sub Error {
    my ($message) = @_;
    print STDERR "$0\: $input_name\: $message\n";
    $errors++;
}

}

__END__

=head1 NAME

manage_news.pl - Script for news management: archiving, last news publishing

=head1 SYNOPSIS

manage_news.pl  [--help|--man]  [-t top.html]  [-b bottom.html]  [-a output_dir] -i mime_db

    Options:
        --infile|-i         Parse MIME news database from given file
        --top|-t            Prepend top.html to the output
        --bottom|-b         Append bottom.html to the output
        --archive-under|-a  Output news under output_dir/{,section/}YYYY/MM.html
        --help              Show brief help message
        --man               Full documentation

=head1 DESCRIPTION

B<manage_news.pl> is a script for news management, including archiving and
last news publishing. It parses a MIME format news database and extracts news
items from it.

=head1 OPTIONS

=over

=item B<--infile mime_db>

Specify a MIME news database to parse.

=item B<--top top.html, --bottom bottom.html>

These options are useful for template insertion. C<--top> prepends the
given file to the output, and C<--bottom> appends it.

=item B<--archive-under output_dir>

Output news under F<output_dir/{,section/}YYYY/MM.html>, where YYYY and MM are numeric
values for year and month, respectively. By default, B<manage_news.pl>
outputs the five last news to standard output.

=item B<--help>

Print a brief help message and exits.

=item B<--man>

Print the manual page and exits.

=back

=head1 TODO

=over

=item * Allow date selection by ranges (like "<2003/08/22" or
"2003/08/20-2003/08/22")

=back

=head1 AUTHOR

Copyright (C) 2003 Anderson Lizardo <andersonlizardo@yahoo.com.br>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

=cut

