|
#!/usr/bin/perl -w
##---------------------------------------------------------------------------##
#
# Program: trquo (TRanslate QUOtes)
# Author: Brian <genius@groupbcl.ca> :)
# Date: May 2013
#
# "trquo" intelligently translates straight quote marks (both single and
# double) favoured by plain text editors and most web pages into curly
# quotes according to conventions common in modern written English.
# Accepts input from one or more files named on the command line or on
# stdin; writes output to stdout. Automatically detects HTML files; in
# HTML mode does not convert quote marks within tags, comments, style
# sheets, JavaScript, or <code> blocks, which might break the rendering.
#
# See -h/--help for details on the switches.
#
##---------------------------------------------------------------------------##
# BUUS: This script is part of Brian's Useful Utilities Set
use strict;
use Getopt::Long;
use charnames qw(:full);
#--- Parse the command line switches
my $html_sw = 0;
my $text_sw = 0;
my $ansi_colour = 0; # Debug level 1; if piping through less, use 'less -R'
my $deep_debug = 0; # Debug level 3
my $debug_sw = 0;
my $help_sw = 0;
Getopt::Long::Configure('bundling', 'no_ignore_case');
GetOptions(
'html|H' => \$html_sw,
'text|t|T' => \$text_sw,
'debug|d+' => \$debug_sw,
'help|h' => \$help_sw
);
#--- Help text
if ($help_sw) {
system('perldoc', $0);
exit 0;
}
#--- Debug levels
# Debug level 1: ANSI colour codes (SGR = Select Graphic Rendition)
# Debug level 2: For debugging curr_state visually, replaces the SGR codes in
# the hash with words that show the state; eg, '{TEXT}', '{SCRIPT}'
# Debug level 3: Deep debugging
my $esc = chr(27);
my %sgr = (
text => "$esc\[37;1m", # Bright white text
html_tag => "$esc\[36m", # Cyan HTML tag
html_comment => "$esc\[34m", # Blue HTML comment
script => "$esc\[32m", # Green <script>
style => "$esc\[33;1m", # Yellow <style>
code => "$esc\[35;1m", # Magenta <code>
reset => "$esc\[0m" # reset
);
my ($sgr_in, $sgr_out) = ('', '');
#--- Figure out what to do for each debug level
$ansi_colour = 1 if $debug_sw; # Debugging always turns on ANSI colour
if ($debug_sw == 2) { # Level 2: change the colours into {STATE} words
$sgr{$_} = '{' . uc($_) . '}' foreach keys %sgr;
$sgr{'reset'} = '';
}
$deep_debug = 1 if $debug_sw == 3; # Level 3: Deep debugging
#--- If we have filenames, ensure they all exist and are readable
my @fn_list = @ARGV;
my $i = 0; # 1 = problem with one or more files
foreach my $filename ( @fn_list ) {
if (-f $filename) {
$i = 1, print STDERR "$filename: no read permission" if ! -r $filename;
} else {
$i = 1, print STDERR "$filename: not found" if ! -r $filename;
}
}
exit 1 if $i;
#--- Characters or strings to use when substituting straight quotes
my ($lsquo, $rsquo, $ldquo, $rdquo);
my $curr_state;
#--- Process the input files and write results to stdout
$fn_list[0] = '-' if ! $fn_list[0]; # Read stdin if no files
foreach my $filename ( @fn_list ) {
print "--- Processing '$filename' as " if $debug_sw;
my $mode;
$mode = 'html' if $html_sw;
$mode = 'text' if $text_sw;
open FILE, $filename or die "Error opening HTML file '$filename' for reading: $!";
$curr_state = 'text';
while ( <FILE> ) {
chomp;
if ($. == 1) { # First line: determine mode if not set by user
$mode = /<(!DOCTYPE|html|head|body)/i ? 'html' : 'text' if ! $mode;
print "$mode\n" if $debug_sw;
($lsquo, $rsquo, $ldquo, $rdquo) = get_quote_subst($mode);
binmode(STDOUT, ":raw");
binmode(STDOUT, ":utf8") if $mode eq 'text';
}
print "line $. [", length($_), "]: ", substr($_, 0, 255), "\n" if $deep_debug;
print $mode eq 'html' ? handle_html($_) : handle_text($_), "\n";
}
close FILE;
}
exit 0;
##---------------------------------------------------------------------------##
# Get the characters or strings to use when substituting straight quotes
##---------------------------------------------------------------------------##
sub get_quote_subst {
my $mode = shift;
return ('‘', '’', '“', '”')
if $mode eq 'html';
# Caution: this is not guaranteed to work on ISO-8859 systems
return (
"\N{LEFT SINGLE QUOTATION MARK}",
"\N{RIGHT SINGLE QUOTATION MARK}",
"\N{LEFT DOUBLE QUOTATION MARK}",
"\N{RIGHT DOUBLE QUOTATION MARK}"
)
}
##---------------------------------------------------------------------------##
# Handle HTML text by running it through a simple state machine that teases
# apart displayable text from HTML tags, comments, CSS, and text within
# <code> blocks; quote marks are updated only within displayable text
##---------------------------------------------------------------------------##
sub handle_html {
my $r = '';
my $start_p = 0;
my $next_state;
while ( /([<>]([^<> ]+)?)/g ) {
my $x = $1;
my $c = substr($1, 0, 1);
my $end_p = pos() - length($x);
# A more complete state machine would handle HTML entities
# (e.g. &), but that's not needed here: we're just trying to
# separate text displayed to the user from HTML / CSS / JS.
$next_state = $curr_state;
if ($c eq '<') { # '<': Start HTML element
if ($curr_state eq 'html_comment') {
# No new states can be entered within a comment
} elsif ($curr_state =~ /^(script|style|code)$/) {
# In script, style and code states, no new state can be entered
# we find the appropriate closing tag. Setting the state to
# 'html_tag' will cause a clean transition to 'text' when the
# '>' is encountered.
$next_state = 'html_tag' if $x eq "</$curr_state";
} else {
$next_state = 'html_tag';
$next_state = 'html_comment' if $x eq '<!--';
$next_state = 'html_comment' if $x eq '<code';
$next_state = 'script' if $x eq '<script';
$next_state = 'style' if $x eq '<style';
$next_state = 'code' if $x eq '<code';
}
} else { # '>': End HTML element
$next_state = 'text' if $curr_state eq 'html_tag';
$next_state = 'text' if $curr_state eq 'html_comment'
&& ($end_p > 1 && substr($_, $end_p-2, 3) eq '-->');
}
($sgr_in, $sgr_out) = ($sgr{$curr_state}, $sgr{reset}) if $ansi_colour;
my $segment = substr($_, $start_p, $end_p - $start_p);
$segment = handle_text($segment)
if $segment =~ /['"]/ && $curr_state eq 'text';
if ($deep_debug) {
print " FOUND '$c' (start_p=$start_p, end_p=$end_p)",
"; \$1=", defined $1 ? "|$1|" : 'undef',
", \$2=", defined $2 ? "|$2|" : 'undef', "\n";
print " SEGMENT |${sgr_in}${segment}${sgr_out}|\n";
print " STATE (\$c is '$c') current=$curr_state, next=$next_state\n";
print "[A] line=$. TEXT ($start_p $end_p) |${sgr_in}${segment}${sgr_out}|\n"
if $curr_state eq 'text' && $segment !~ /^\s*$/;
} else {
$r .= $sgr_in . $segment;
$sgr_in = $sgr{$next_state}, $r .= $sgr_out . $sgr_in
if $ansi_colour && $c eq '<';
$r .= $c . $sgr_out;
}
$curr_state = $next_state;
($sgr_in, $sgr_out) = ($sgr{$curr_state}, $sgr{reset}) if $ansi_colour;
$start_p = $end_p + 1;
}
# Handle any text after the last '>' on the line
my $end_p = length($_);
my $segment = substr($_, $start_p, $end_p - $start_p);
$segment = handle_text($segment)
if $segment =~ /['"]/ && $curr_state eq 'text';
if ($deep_debug) {
print "LOOP EXIT (start_p=$start_p, end_p=$end_p) state=$curr_state\n";
print " SEGMENT |${sgr_in}${segment}${sgr_out}|\n";
print "[B] line=$. TEXT ($start_p $end_p) |${sgr_in}${segment}${sgr_out}|\n"
if $curr_state eq 'text' && $segment !~ /^\s*$/;
print "----------";
} else {
$r .= $sgr_in . $segment . $sgr_out;
}
return $r;
}
##---------------------------------------------------------------------------##
# Change straight left/right single and double quotes into curly quotes
# according to the following (English) rules:
# If at the beginning of line, use left-quote
# If preceded by whitespace, use left-quote
# If preceded by an alpahnumeric character, use right-quote
# If followed by an alphanumeric character, use left-quote
# Everything else: Right-quote
##---------------------------------------------------------------------------##
sub handle_text {
my $t = shift;
# Single-quotes
$t =~ s/^'/$lsquo/g;
$t =~ s/([ \t])'/$1$lsquo/g;
$t =~ s/([[:alnum:]])'/$1$rsquo/g;
$t =~ s/'([[:alnum:]])/$lsquo$1/g;
$t =~ s/'/$rsquo/g;
# Double-quotes
$t =~ s/^"/$ldquo/g;
$t =~ s/([ \t])"/$1$ldquo/g;
$t =~ s/([[:alnum:]])"/$1$rdquo/g;
$t =~ s/"([[:alnum:]])/$ldquo$1/g;
$t =~ s/"/$rdquo/g;
return $t;
}
__END__
=head1 NAME
trquo: TRanslate QUotes
=head1 SYNOPSIS
trquo [-HTtd] [--html|--text|--debug|--help] [file ...]
=head1 DESCRIPTION
"trquo" intelligently translates straight quote marks (both single and double)
favoured by plain text editors and most web pages into curly quotes according
to conventions common in modern written English. Accepts input from one or more
files named on the command line or on stdin; writes output to stdout.
Automatically detects HTML files; in HTML mode does not convert quote marks
within tags, comments, stylesheets, JavaScript, or <code> blocks, which might
break the rendering.
=head1 OPTIONS
=over
=item -H, --html
Force all input to be processed as HTML. In HTML mode, trquo does not convert
quote marks within tags, comments, stylesheets, JavaScript, or <code> blocks,
which might break the rendering. In addition, the HTML etities ‘/’
are ouput when translating single quotes to curly quotes, and the entities
“/” are output when translating double quotes to curly quotes.
=item -t, -T, --text
Force all input to be processed as text. All quote marks found are translated,
and UTF-8 characters are output when doing the translation. This option turns
off automatic HTML detection. If used when processing HTML files, it will
produce output that likely cannot be processed correctly by HTML rendering
engines.
=item -d, --debug
Turns on debugging mode. Three debug levels are available, available by using
-d/--debug multiple times.
=over
=item Level 1 ('-d')
Adds colour to the output to indicate what type of text within the HTML stream
trquo thinks it's processing:
White Text visible to the user
Cyan HTML tags
Blue HTML comments
Yellow CSS
Green Text within <script> blocks
Purple Text within <code> blocks
Unlike many programs, debugging output is written to stdout, not stderr. If
you like to run your output through 'less', use 'less -R' to preserve the
colours.
=item Level 2 ('-dd')
Instead of colour, uses tags such as {TEXT}, {HTML_TAG}, {SCRIPT} to
track the state machine as it switches from one state to another.
=item Level 3 ('-ddd')
Deep debugging mode. Provides a detailed trace of the text as it's parsed and
processed by the state machine.
=back
=item -h, --help
This help text
=back
=head1 BUGS
trquo writes only UTF-8 output: it doesn't handle other common formats such as
ISO-8859-1.
trquo does not recognize TeX-like conventions where `` is an opening
double-quote and '' is a closing double-quote.
There are still some edge cases where trquo will render a curly-quote
incorrectly. Known cases are quotes surrounding a stretch of whitespace,
and quote marks within a stretch of non-alphanumeric characters.
=head1 AUTHOR
trquo was written by Brian, <perl-programmers@groupbcl.ca>
# vim: tabstop=4
|