#!/usr/bin/perl
# htmldebloater -- debloat HTML so it can be comfotably read even a
# turn of the century black and white PDA
# Copyright       : http://www.fsf.org/copyleft/gpl.html
# Created On      : December 2004
# Last Modified On: Sat Feb  6 02:47:12 2010
# Update Count    : 23
# Inspiration     : Dan Jacobson -- http://jidanni.org/comp
# Actual brains   : Sam Watkins <swatkins@fastmail.fm>

use warnings;
use strict;

use HTML::Entities;

# Only tags in this list will get through.
my $ok_tags = set(
    qw(
      html head body
      title base meta
      p br hr
      a img
      table th tr td
      b i u em strong
      center
      blockquote
      ul ol li dl dt dd
      h1 h2 h3 h4 h5 h6
      pre
      div
      )
);

# Attributes in this list will get through no matter what tag they are in.
# - I can't think of any yet.
my $ok_attr = set(
    qw(
      )
);

# a list of allowed attributes for each tag
my $ok_tag_attr = {
    td   => set(qw(colspan rowspan align valign)),
    a    => set(qw(href name)),
    img  => set(qw(src width height alt title)),
    base => set(qw(href)),
    meta => set(qw(http-equiv content)),
};

# a list of tags where we want to hide the content between <foo> and </foo>
my $kill_containers = set(
    qw(
      script style
      select textarea
      )
);

# Tags in this list will be filtered based on a predicate over their attributes.
# The "predicate" could also change the attribute names or values, delete attributes, etc.
my $tag_filter = {
    meta => sub { my $attr = $_[0]; $attr->{'http-equiv'} && $attr->{content} }
};

use strict;
use warnings;
use HTML::Parser;

my $in_dead_container = 0;

my $parser = HTML::Parser->new(
    text_h => [ \&pass_through, "text" ],
    declaration_h => [ \&pass_through, "text" ],
    start_h => [ \&start_tag, "self, tagname, attrseq, attr" ],
    end_h   => [ \&end_tag,   "self, tagname, text" ]
    #	no comments will get through
    #	default_h => [ sub    { print shift },        'text' ],
);
$parser->parse_file(*STDIN);

sub pass_through {
    print shift unless $in_dead_container;
}

sub start_tag {
    my ( $self, $tag, $attrseq, $attr ) = @_;
    if ( $ok_tags->{$tag} && !$in_dead_container ) {
        $attrseq =
          [ grep { $ok_attr->{$_} || $ok_tag_attr->{$tag}{$_} } @$attrseq ];
	my $keep_tag = 1;
	if (my $pred = $tag_filter->{$tag}) {
	    $keep_tag = $pred->($attr);
	}
	if ($keep_tag) {
	    print format_start_tag( $tag, $attrseq, $attr );
	}
    }
    if ( $kill_containers->{$tag} ) {
        ++$in_dead_container;
    }
}

sub end_tag {
    my ( $self, $tag, $text ) = @_;
    print $text if $ok_tags->{$tag} && !$in_dead_container;
    if ( $kill_containers->{$tag} ) {
        --$in_dead_container;
    }
}

sub set {
    return { map { $_, 1 } @_ };
}

sub format_start_tag {
    my ( $tag, $attrseq, $attr ) = @_;
    my $out = "<$tag";
    for (@$attrseq) {
        $out .= " $_";
	if ( defined($attr->{$_}) ) {
	    $out .= '="' . encode_entities( $attr->{$_} ) . '"';
	}
    }
    $out .= '>';
    return $out;
}
