trquo (Source)

#!/usr/bin/perl -w
##---------------------------------------------------------------------------##
#
#   Program:    trquo (TRanslate QUOtes)
#   Author:     Brian <genius@groupbcl.ca> :)
#   Date:       May 2013
#
#   "trquo" intelligently translates straight quote marks (both single and 
#   double) favoured by plain text editors and most web pages into curly 
#   quotes according to conventions common in modern written English. 
#   Accepts input from one or more files named on the command line or on 
#   stdin; writes output to stdout. Automatically detects HTML files; in 
#   HTML mode does not convert quote marks within tags, comments, style
#   sheets, JavaScript, or <code> blocks, which might break the rendering.
#
#   See -h/--help for details on the switches.
#
##---------------------------------------------------------------------------##
# BUUS: This script is part of Brian's Useful Utilities Set

use strict;
use Getopt::Long;
use charnames qw(:full);

#--- Parse the command line switches
my $html_sw = 0;
my $text_sw = 0;
my $ansi_colour = 0;    # Debug level 1; if piping through less, use 'less -R'
my $deep_debug = 0;     # Debug level 3
my $debug_sw = 0;
my $help_sw = 0;

Getopt::Long::Configure('bundling', 'no_ignore_case');
GetOptions(
    'html|H' =>         \$html_sw,
    'text|t|T' =>       \$text_sw,
    'debug|d+' =>       \$debug_sw,
    'help|h' =>         \$help_sw
);

#--- Help text
if ($help_sw) {
    system('perldoc', $0);
    exit 0;
}

#--- Debug levels
# Debug level 1: ANSI colour codes (SGR = Select Graphic Rendition)
# Debug level 2: For debugging curr_state visually, replaces the SGR codes in 
#   the hash with words that show the state; eg, '{TEXT}', '{SCRIPT}'
# Debug level 3: Deep debugging
my $esc = chr(27);
my %sgr = (
    text => "$esc\[37;1m",          # Bright white  text
    html_tag => "$esc\[36m",        # Cyan          HTML tag
    html_comment => "$esc\[34m",    # Blue          HTML comment
    script => "$esc\[32m",          # Green         <script>
    style => "$esc\[33;1m",         # Yellow        <style>
    code => "$esc\[35;1m",          # Magenta       <code>
    reset => "$esc\[0m"             # reset
);
my ($sgr_in, $sgr_out) = ('', '');

#--- Figure out what to do for each debug level
$ansi_colour = 1 if $debug_sw;      # Debugging always turns on ANSI colour
if ($debug_sw == 2) {               # Level 2: change the colours into {STATE} words
    $sgr{$_} = '{' . uc($_) . '}' foreach keys %sgr;
    $sgr{'reset'} = '';
}
$deep_debug = 1 if $debug_sw == 3;  # Level 3: Deep debugging

#--- If we have filenames, ensure they all exist and are readable
my @fn_list = @ARGV;
my $i = 0;  # 1 = problem with one or more files
foreach my $filename ( @fn_list ) {
    if (-f $filename) {
        $i = 1, print STDERR "$filename: no read permission" if ! -r $filename;
    } else {
        $i = 1, print STDERR "$filename: not found" if ! -r $filename;
    }
}
exit 1 if $i;

#--- Characters or strings to use when substituting straight quotes
my ($lsquo, $rsquo, $ldquo, $rdquo);
my $curr_state;

#--- Process the input files and write results to stdout
$fn_list[0] = '-' if ! $fn_list[0];     # Read stdin if no files
foreach my $filename ( @fn_list ) {
    print "--- Processing '$filename' as " if $debug_sw;
    my $mode;
    $mode = 'html' if $html_sw;
    $mode = 'text' if $text_sw;
    open FILE, $filename or die "Error opening HTML file '$filename' for reading: $!";
    $curr_state = 'text';
    while ( <FILE> ) {
        chomp;
        if ($. == 1) {  # First line: determine mode if not set by user
            $mode = /<(!DOCTYPE|html|head|body)/i ? 'html' : 'text' if ! $mode;
            print "$mode\n" if $debug_sw;
            ($lsquo, $rsquo, $ldquo, $rdquo) = get_quote_subst($mode);
            binmode(STDOUT, ":raw");
            binmode(STDOUT, ":utf8") if $mode eq 'text';
        }
        print "line $. [", length($_), "]: ", substr($_, 0, 255), "\n" if $deep_debug;
        print $mode eq 'html' ? handle_html($_) : handle_text($_), "\n";
    }
    close FILE;
}

exit 0;

##---------------------------------------------------------------------------##
#   Get the characters or strings to use when substituting straight quotes
##---------------------------------------------------------------------------##
sub get_quote_subst {
    my $mode = shift;
    return ('&lsquo;', '&rsquo;', '&ldquo;', '&rdquo;')
        if $mode eq 'html';

    # Caution: this is not guaranteed to work on ISO-8859 systems
    return (
        "\N{LEFT SINGLE QUOTATION MARK}",
        "\N{RIGHT SINGLE QUOTATION MARK}",
        "\N{LEFT DOUBLE QUOTATION MARK}",
        "\N{RIGHT DOUBLE QUOTATION MARK}"
    )
}

##---------------------------------------------------------------------------##
#   Handle HTML text by running it through a simple state machine that teases
#   apart displayable text from HTML tags, comments, CSS, and text within
#   <code> blocks; quote marks are updated only within displayable text
##---------------------------------------------------------------------------##
sub handle_html {
    my $r = '';
    my $start_p = 0;
    my $next_state;
    while ( /([<>]([^<> ]+)?)/g ) {
        my $x = $1;
        my $c = substr($1, 0, 1);
        my $end_p = pos() - length($x);

        # A more complete state machine would handle HTML entities
        # (e.g. &amp;), but that's not needed here: we're just trying to
        # separate text displayed to the user from HTML / CSS / JS.
        $next_state = $curr_state;
        if ($c eq '<') {        # '<': Start HTML element
            if ($curr_state eq 'html_comment') {
                # No new states can be entered within a comment
            } elsif ($curr_state =~ /^(script|style|code)$/) {
                # In script, style and code states, no new state can be entered
                # we find the appropriate closing tag. Setting the state to
                # 'html_tag' will cause a clean transition to 'text' when the
                # '>' is encountered.
                $next_state = 'html_tag' if $x eq "</$curr_state";
            } else {
                $next_state = 'html_tag';
                $next_state = 'html_comment' if $x eq '<!--';
                $next_state = 'html_comment' if $x eq '<code';
                $next_state = 'script' if $x eq '<script';
                $next_state = 'style' if $x eq '<style';
                $next_state = 'code' if $x eq '<code';
            }
        } else {        # '>': End HTML element
            $next_state = 'text' if $curr_state eq 'html_tag';
            $next_state = 'text' if $curr_state eq 'html_comment'
                && ($end_p > 1 && substr($_, $end_p-2, 3) eq '-->');
        }

        ($sgr_in, $sgr_out) = ($sgr{$curr_state}, $sgr{reset}) if $ansi_colour;

        my $segment = substr($_, $start_p, $end_p - $start_p);
        $segment = handle_text($segment)
            if $segment =~ /['"]/ && $curr_state eq 'text';

        if ($deep_debug) {
            print "    FOUND '$c' (start_p=$start_p, end_p=$end_p)",
                "; \$1=", defined $1 ? "|$1|" : 'undef',
                ", \$2=", defined $2 ? "|$2|" : 'undef', "\n";
            print "  SEGMENT |${sgr_in}${segment}${sgr_out}|\n";
            print "    STATE (\$c is '$c') current=$curr_state, next=$next_state\n";
            print "[A] line=$. TEXT ($start_p $end_p) |${sgr_in}${segment}${sgr_out}|\n"
                if $curr_state eq 'text' && $segment !~ /^\s*$/;
        } else {
            $r .= $sgr_in . $segment;
            $sgr_in = $sgr{$next_state}, $r .= $sgr_out . $sgr_in
                if $ansi_colour && $c eq '<';
            $r .= $c . $sgr_out;
        }
        $curr_state = $next_state;
        ($sgr_in, $sgr_out) = ($sgr{$curr_state}, $sgr{reset}) if $ansi_colour;
        $start_p = $end_p + 1;
    }

    # Handle any text after the last '>' on the line
    my $end_p = length($_);
    my $segment = substr($_, $start_p, $end_p - $start_p);
    $segment = handle_text($segment)
        if $segment =~ /['"]/ && $curr_state eq 'text';

    if ($deep_debug) {
        print "LOOP EXIT (start_p=$start_p, end_p=$end_p) state=$curr_state\n";
        print "  SEGMENT |${sgr_in}${segment}${sgr_out}|\n";
        print "[B] line=$. TEXT ($start_p $end_p) |${sgr_in}${segment}${sgr_out}|\n"
            if $curr_state eq 'text' && $segment !~ /^\s*$/;
        print "----------";
    } else {
        $r .= $sgr_in . $segment . $sgr_out;
    }

    return $r;
}

##---------------------------------------------------------------------------##
# Change straight left/right single and double quotes into curly quotes
# according to the following (English) rules:
#   If at the beginning of line, use left-quote
#   If preceded by whitespace, use left-quote
#   If preceded by an alpahnumeric character, use right-quote
#   If followed by an alphanumeric character, use left-quote
#   Everything else: Right-quote
##---------------------------------------------------------------------------##
sub handle_text {
    my $t = shift;

    # Single-quotes
    $t =~ s/^'/$lsquo/g;
    $t =~ s/([ \t])'/$1$lsquo/g;
    $t =~ s/([[:alnum:]])'/$1$rsquo/g;
    $t =~ s/'([[:alnum:]])/$lsquo$1/g;
    $t =~ s/'/$rsquo/g;

    # Double-quotes
    $t =~ s/^"/$ldquo/g;
    $t =~ s/([ \t])"/$1$ldquo/g;
    $t =~ s/([[:alnum:]])"/$1$rdquo/g;
    $t =~ s/"([[:alnum:]])/$ldquo$1/g;
    $t =~ s/"/$rdquo/g;

    return $t;
}

__END__

=head1 NAME

trquo: TRanslate QUotes

=head1 SYNOPSIS

  trquo [-HTtd] [--html|--text|--debug|--help] [file ...]

=head1 DESCRIPTION

"trquo" intelligently translates straight quote marks (both single and double) 
favoured by plain text editors and most web pages into curly quotes according 
to conventions common in modern written English. Accepts input from one or more 
files named on the command line or on stdin; writes output to stdout. 
Automatically detects HTML files; in HTML mode does not convert quote marks  
within tags, comments, stylesheets, JavaScript, or <code> blocks, which might 
break the rendering.

=head1 OPTIONS

=over

=item -H, --html

Force all input to be processed as HTML. In HTML mode, trquo does not convert 
quote marks within tags, comments, stylesheets, JavaScript, or <code> blocks,
which might break the rendering. In addition, the HTML etities &lsquo;/&rsquo;
are ouput when translating single quotes to curly quotes, and the entities
&ldquo;/&rdquo; are output when translating double quotes to curly quotes.

=item -t, -T, --text

Force all input to be processed as text. All quote marks found are translated, 
and UTF-8 characters are output when doing the translation. This option turns 
off automatic HTML detection. If used when processing HTML files, it will 
produce output that likely cannot be processed correctly by HTML rendering 
engines.

=item -d, --debug

Turns on debugging mode. Three debug levels are available, available by using
-d/--debug multiple times.

=over

=item Level 1 ('-d')

Adds colour to the output to indicate what type of text within the HTML stream
trquo thinks it's processing:

  White     Text visible to the user
  Cyan      HTML tags
  Blue      HTML comments
  Yellow    CSS
  Green     Text within <script> blocks
  Purple    Text within <code> blocks

Unlike many programs, debugging output is written to stdout, not stderr. If 
you like to run your output through 'less', use 'less -R' to preserve the
colours.

=item Level 2 ('-dd')

Instead of colour, uses tags such as {TEXT}, {HTML_TAG}, {SCRIPT} to
track the state machine as it switches from one state to another.

=item Level 3 ('-ddd')

Deep debugging mode. Provides a detailed trace of the text as it's parsed and
processed by the state machine.

=back

=item -h, --help

This help text

=back

=head1 BUGS

trquo writes only UTF-8 output: it doesn't handle other common formats such as
ISO-8859-1.

trquo does not recognize TeX-like conventions where `` is an opening 
double-quote and '' is a closing double-quote.

There are still some edge cases where trquo will render a curly-quote 
incorrectly. Known cases are quotes surrounding a stretch of whitespace,
and quote marks within a stretch of non-alphanumeric characters.

=head1 AUTHOR

trquo was written by Brian, <perl-programmers@groupbcl.ca>

# vim: tabstop=4