#!/usr/bin/perl
# Make an email, (e.g., RISKS.org submission) into UTF-8, no matter what
# charset it arrived with. Also strips boring headers.
# Depends on: apt-get install libemail-mime-perl
# Usage: pipe a message into this program.
# Of course we are talking about piping in a single raw email message,
# with all its original headers. As those headers tell us what charset
# it is in, which we need to know in order to be able to convert to
# UTF-8.
#
# Copyright       : https://www.fsf.org/copyleft/gpl.html
# Author          : Dan Jacobson -- jidanni@jidanni.org https://www.jidanni.org/
# Latest Version  : https://www.jidanni.org/comp/mail/riskfix
# Created On      : Sun Jun 13 19:22:14 2021
# Last Modified On: Mon Jan 17 00:53:09 2022
# Update Count    : 162
use strict;
use warnings FATAL => q(all);
use Email::MIME;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
my $message;
## my $boggy;
while (<>) {

    # Some users e.g., send utf-8 mail, but somehow their header says something else.

    # Therefore the Moderator, upon spotting a Mess, needs to edit the
    # headers by hand, and re-pipe it through this program.

    # Or we could edit them for him:

    #     if (/^From: .*dannyb\@panix.com/) { $boggy++ }
    #     if ( $boggy && /CHARSET/ ) {
    #         s/iso-8859-1/utf-8/;
    #     }

    # But this is surely a one-time incident, as article submitters
    # here, all being computer professionals, can be convinced to
    # reconfigure their systems.

    # One would think we can use Encode::Guess to help spot such incidents...
    #
    # But like that man page says, "ISO-8859 series is just too successful for
    # most cases (because it fills almost all code points in \x00-\xff)."

    #  Anyway, e.g.,
    #  U+00F1 LATIN SMALL LETTER N WITH TILDE: UTF-8: c3 b1

    # 	is also

    # $ man ISO_8859-1
    # C3       LATIN CAPITAL LETTER A WITH TILDE
    # B1       PLUS-MINUS SIGN

    # so we would have to delve into sniffing the likeliness of juxtapositions...

    $message .= $_;
}
my $parsed = Email::MIME->new($message);
my %p      = $parsed->header_str_pairs;
for (qw/From Subject Date/) {
    printf "%s: %s\n", $_, $p{$_};
}
print "\n";
my $txt;
my $html;
$parsed->walk_parts(
    sub {
        my ($part) = @_;
        if ( !$part->content_type ) {
            die "$0: Hey, there was no content_type etc. header!";
        }
        if ( $part->content_type =~ m[text/html]i ) {
            $html++;
        }
        elsif ( $part->content_type =~ m[text/plain]i ) {
            $txt = $part->body_str;    #converts to Unicode
            return;
        }
    }
);
if ( $html && !$txt ) {
    die
      "$0: Gosh, the message only has a HTML part and well, we aren't ready for that yet.";
## Perhaps start with /usr/share/doc/libhtml-parser-perl/examples/htext
## Or maybe:
## Package: mailtextbody : tool to return the body of an email message
## Mailtextbody reads a complete email message on stdin and returns the body
## on stdout.  Technically speaking, it returns the first decoded text/plain...
}

$_ = $txt || $parsed->body_str;    #converts to Unicode
s/^\n+//;                          #no beginning blank lines
s/[ \t\r]+$//gm;                   #strip trailing blanks etc.
s{\n{3,}}{\n\n}g;                  #compress empty lines
s{\n{2,}$}{\n};                    #no ending blank lines
my $entity_count;
entitiy_check($_) unless $entity_count;
print;

## (For going further and stripping
## https://example.com/bla?junk=at-the-end-of-URLs, I would recommend
## a separate program...)

sub entitiy_check {
    my $s = $_[0];
    $s =~ s/https?:\S+//g;
    if ( $s =~ /&\w+;/ ) {
        print STDERR <<EOF;

*************
* $0: Message seems to contain HTML entities like \"$&\", but we were expecting non-HTML input.
* Sure, one could install the libhtml-parser-perl package, then run
* \$ perl -MHTML::Entities -CSD -wlne 'print decode_entities(\$_);' Message.file
* on it... but note that http(s): URLs often contain similar looking &x=y arguments that
* must not get injured... Anyway to eliminate all Risks, the moderator should clean it up by hand.
*************

EOF
        $entity_count++;
    }
}
