genTOC (Source)

#!/usr/bin/perl -w
##---------------------------------------------------------------------------##
#   Generates a nested table of contents for a file, based on the formatting
#   model I initially saw in the MySQL documentation.
#
#   The script looks for lines that look like the following:
#
#   Some text
#   =========
#
#   Next heading level
#   ------------------
#
#       Another heading level
#       ~~~~~~~~~~~~~~~~~~~~~
#
#   Note the underline, which consists of repeating non-alphanumeric characters
#   of the same length as the text on the line above. Levels are assigned
#   automatically as each new underline is encountered.
#
#   The above might be formatted in the Table of Contents as:
#       8  Some text
#      11    Next heading level
#      14      Another heading level
#
#   The table of contents is inserted after the line 'Table of Contents' (and
#   its underline) is encountered. Any existing table of contents is replaced.
##---------------------------------------------------------------------------##
# BUUS: This script is part of Brian's Useful Utilities Set

# The program 'setext-headings' promotes or demotes headings in an existing
# file, which can be useful at times.

use strict;

die "genTOC: Require the name of one or more text files" if ! $ARGV[0];

my @text;       # Text from the file
my @text_stat;  # stat($text_fn)

my @toc;        # Entries are ARRAY refs: [0]=line num, [1]=level, [2]=text
use constant TOC_LINE_NUM => 0;
use constant TOC_LEVEL    => 1;
use constant TOC_TEXT     => 2;

my ($toc_start, $toc_end) = (0, 0);
my %level;      # Key = underline character, data = level

my $code_fence;     # Found in markdown files


##---------------------------------------------------------------------------##
#   Open the text file, get its inode data, determine where its existing table
#   of contents is, and build a new toc array. Return 0 if no line reading
#   'Table of Contents' line is found.
##---------------------------------------------------------------------------##
sub read_text_file {
    my $text_fn = shift;
    print STDERR "$text_fn\n";

    open TEXT, $text_fn or die "  Error opening TEXT file '$text_fn' for reading: $!";
    @text_stat = stat($text_fn);

    @toc = (); ($toc_start, $toc_end) = (0, 0); my $toc_sw = 0;
    %level = ();

    # Read the text file, determine where its existing table of contents is, and
    # build a new table
    @text = ( undef );  # [0]=undef keeps array (0-based) and $. (1-based) in sync
    while (<TEXT>) {
        chomp;
        push @text, $_;

        if ($toc_sw) {      # Look for the end of the table of contents
            $toc_sw = 0;                    # Assume this is not a TOC line
            $toc_sw = 1 if /^\s*$/;         # Blank lines are TOC lines
            $toc_sw = 1 if /^\s*\d+\.? /;   # So are ' ####. TEXT' lines
            $toc_sw = 1 if /^\s*([^[:alnum:]])\1+\s*$/ && $. == $toc_start+1;
            if (! $toc_sw) {    # We found a non-TOC line: back up to last non-empty line
                $toc_end = $. - 1;
                $toc_end-- while $text[$toc_end] =~ /^\s*$/;
            }
        }

        $toc_start = $., $toc_sw = 1 if /^\s*Table of Contents/i && ! $toc_start;

        my $handled_as_ToC = 0;
        if (/^(\s*)(([^[:alnum:]` ])\3+)\s*$/) {    # This looks like an AUC underline
            # Check if this is a closing code fence
            if ($code_fence && /^ ? ? ?(([`~])\2\2+)/ && length($1) >= length($code_fence)) {
                $code_fence='', $_='';
                next;
            }

            # Check for an AUC underline
            my ($ul_lead, $underline) = ($1, $2);
            $text[$. - 1] =~ /^(\s*)(.*)\s*$/;
            my ($h_lead, $header) = ($1, $2);
            if (! $code_fence                   # Can't be within a code block
                    && $ul_lead ne "\t"         # "Spaces" before can't be a tab
                    && length($ul_lead) < 4     # Must start in columns 1, 2, or 3
                    && $h_lead eq $ul_lead      # Header and underline must start at same column
                    && length($header) - length($underline) <= 1
                    && length($header) - length($underline) >= -2
            ) {
                my $u_char = substr($underline, 0, 1);
                $level{$u_char} = scalar(keys(%level)) if ! defined $level{$u_char};
                if ($toc_start && lc($header) ne 'table of contents') {
                    push @toc, [ $.-1, $level{$u_char}, $header ];
                    $handled_as_ToC = 1;
                }
            }
        }

        # Handle a Markdown ATX heading
        if (! $code_fence && /^ ? ? ?(#+)\s+(.*)/) {
            push @toc, [ $. - 1, length($1)-1, $2 ];
            $handled_as_ToC = 1;
        }

        # Check for an opening code fence
        $code_fence = $code_fence ? '' : $1
            if ! $handled_as_ToC && /^ ? ? ?(([~`])\2\2+)/;
    }
    close TEXT;

    print STDERR "  Did not find a 'Table of Contents' line\n" if ! $toc_start;
    return $toc_start;
}

##---------------------------------------------------------------------------##
#       Determine where the existing table of contents starts and stops
##---------------------------------------------------------------------------##
sub find_curr_toc {
    # Determine where the Table of Contents actually starts within the text
    $toc_start++;   # Skip 'Table of Contents'; next line: skip underline
    $toc_start++ if $text[$toc_start] =~ /^\s*([^[:alnum:]])\1+\s*$/;
    $toc_start++ while $text[$toc_start] =~ /^\s*$/;    # Skip blank lines

    # If the Table of Contents ends before it starts, it means we have an empty table
    if ($toc_end < $toc_start) {
        $toc_start--;
        $toc_end = $toc_start-1;
    }
}


##---------------------------------------------------------------------------##
#       Format the TOC array for insertion into the main text
##---------------------------------------------------------------------------##
sub format_new_toc {
    # If we have sub-levels, go through the TOC and insert a blank line before every
    # level 0 entry
    if (scalar(keys(%level)) > 1) {
        for (my $i = @toc-1; $i > 1; $i--) {
            splice @toc, $i, 0, '' if $toc[$i][1] == 0;
        }
    }

    my $curr_toc_length = $toc_end-$toc_start+1;
    my $new_toc_length = @toc;
    my $w = length($toc[-1][TOC_LINE_NUM] + $new_toc_length);   # w = width of last line number in TOC
    foreach (@toc) {
        next if ! ref($_);
        my $i = $$_[TOC_LINE_NUM];
        $i += ($new_toc_length - $curr_toc_length) if $$_[TOC_LINE_NUM] > $toc_start;
        my $x = sprintf("  %${w}i  %s%s", $i, '  ' x $$_[TOC_LEVEL], $$_[TOC_TEXT]);
        $_ = $x;
    }
}

##---------------------------------------------------------------------------##
#   Splice the new table of contents into the text array and rewrite the
#   text file
##---------------------------------------------------------------------------##
sub update_text_file {
    my $text_fn = shift;

    # Splice in the table of contents
    my $curr_toc_length = $toc_end-$toc_start+1;
    my $new_toc_length = @toc;
    print STDERR "  Removing existing table of contents ($curr_toc_length lines) from text\n" if $curr_toc_length;
    print STDERR "  Inserting new table of contents ($new_toc_length lines) into text at line $toc_start\n";
    splice @text, $toc_start, $curr_toc_length, @toc;

    # Re-write the file (preserve its timestamps)
    open TEXT, "> $text_fn" or die "Error opening TEXT file '$text_fn' for writing: $!";
    for (my $i = 1; $i < @text; $i++) { print TEXT "$text[$i]\n" }
    close TEXT;
    utime $text_stat[8], $text_stat[9], $text_fn;
}

##---------------------------------------------------------------------------##
#                   M A I N   P R O C E S S I N G
##---------------------------------------------------------------------------##
foreach my $text_fn (@ARGV) {
    if (read_text_file($text_fn)) {
        find_curr_toc();
        format_new_toc();
        update_text_file($text_fn);
    }
}

# vim: tabstop=4