estimate-tar-size (Source)

#!/usr/bin/awk -f
##---------------------------------------------------------------------------##
#   Program:    estimate-tar-size
#   Author:     Brian <genius@groupbcl.ca> :)
#   Date:       January 2018
#
#   Given the output of an "ls -l" or "ls -lR" command, computes the number
#   of 512-byte blocks the directories and files in the listing would take
#   when packed into a tar file. Output is to stdout in the form '#k'
#   (where '#' is the expected size of the tar file in KiB), which can be
#   passed to 'pv' to monitor the progress of creating the tar file.
#
#   Example:
#       DIR=directory-name
#       TAR_SIZE=$(ls -laR $DIR | estimate-tar-size); echo $TAR_SIZE
#       tar cf - $DIR | pv -s$TAR_SIZE | bzip2 >$DIR.tar.bz2
#
#   Debugging output, written to stderr, can be seen by passing '-vdebug=1'
#   to the program:
#
#       ls -lR some-directory | estimate-tar-size -vdebug=1 2>&1 | less
#
#   The final number will be off by a little bit because this program
#   assumes a tar blocking factor of 1, while tar itself usually uses 20.
#
##---------------------------------------------------------------------------##
# BUUS: This script is part of Brian's Useful Utilities Set

# Running totals: T counts 512-byte tar blocks, N counts listing entries
BEGIN {
    N = 0
    T = 0
}

# With debugging enabled, echo every input record and its length to stderr
debug {
    print "read[" length($0) "] " $0 >"/dev/stderr"
}

# A "total <number>" summary line carries no entry of its own, so drop it
# before any other rule sees it
$0 ~ /^total [0-9]+$/ {
    if (debug)
        print ">  found total line (skipping)" >"/dev/stderr"
    next
}

# If the very first line ends with ":", treat it as a directory line
NR==1 && /:$/ { dir_line = 1 }

# Process a directory line: remember the directory name, with its trailing
# ":" turned into "/", so it can be prefixed to the entry names that follow.
# The line itself is not an entry, so nothing else is done with it.
dir_line {
    curr_dir = $0
    sub(/:$/, "/", curr_dir)
    if (debug) { print ">  found directory line; curr_dir=\"" curr_dir "\"" >"/dev/stderr" }
    dir_line = 0
    next
}

# The line after a blank line is a directory line
length($0) == 0 { dir_line = 1 }

# Failsafe: if we have less than 9 fields, we don't have a full 'ls' line.
# Count the fields rather than testing $9 for truth: a name such as "0" is
# numerically false and would otherwise be dropped from the estimate.
NF < 9 { next }

# Figure out *exactly* where in $0 the file/directory/pipe/etc name starts. We
# can't rely on the name always appearing in the same column because long user
# and group names, or unusually large file sizes, can throw it off.  And we
# can't simply look in $0 for the start of the file name, because that can fail
# in a couple of unexpected ways. For example, if the file name is a single
# number, that number might be found in the file length or time field.  Or a
# file named 'jane acceptance letter.doc' would have $9=='jane', which might
# also be found in the owner or group field.  So we build an anchor string
# from the field just before the date, the date itself, and the first word of
# the file name, then determine where in $0 that longish substring starts.
# The name begins length(test_str)-length(fn_start) characters after that.
#
# Sets: filename   - curr_dir (if any) followed by the entry name
#       symlink_to - link target for symbolic links, "" for everything else
{
    N++     # Number of entries processed

    # Block and character devices show "major, minor" where other entries
    # show a size, which shifts every later field one position to the right:
    #   $1        $2   $3    $4  $5 $6  $7 $8    $9       $10
    #   brw-r--r-- 1 root  root  11, 0 Jan  9 18:01 block-sr0
    #   $1        $2   $3    $4       $5  $6 $7    $8           $9
    #   -rw-r--r-- 1 root  root  2820004 May 23  2013 regular-file
    # The anchor starts at the minor number (not the major) for devices,
    # because ls pads the minor number to a common width ("8,   0"), so the
    # gap between major and minor is not a fixed single space.
    anchor_fld = ($1 ~ /^[bc]/) ? 6 : 5
    test_str = sprintf("%s %s %2s %5s %s", $(anchor_fld), $(anchor_fld+1), $(anchor_fld+2), $(anchor_fld+3), $(anchor_fld+4))
    fn_start = $(anchor_fld+4)  # (first word of file name)

    i = index($0, test_str)
    if (i < 10) {
        # Not found (index returned 0), or found implausibly early in the line
        print "ASSERT: unable to find test_str in $0" >"/dev/stderr"
        print "  $0 = \"" $0 "\"" >"/dev/stderr"
        print "  test_str = \"" test_str "\"" >"/dev/stderr"
        exit
    }
    filename = (curr_dir ? curr_dir : "") substr($0, i+length(test_str)-length(fn_start))

    # Only symbolic links are listed as "name -> target". Other entry types
    # may legitimately contain " -> " in their names and must not be split.
    symlink_to = ""
    if ($1 ~ /^l/ && match(filename, /^(.*) -> (.*)$/, a)) {
        filename = a[1]
        symlink_to = a[2]
    }

    if (debug) {
        print ">  [" length(filename) "] \"" filename "\"" >"/dev/stderr"
        if (symlink_to) {
            print "-> [" length(symlink_to) "] \"" symlink_to "\"" >"/dev/stderr"
        }
    }
}

# Add the blocks needed by the current entry to the running total T.
{
    # Begin from a payload that yields exactly one block (the header alone);
    # this is what directories, pipes, and character/block files need
    payload = -512

    # Regular files with a non-zero size contribute their actual size
    if (/^-/ && $5)
        payload = $5 - 1

    # Data blocks plus the header block
    entry_blocks = int(payload/512) + 2

    if (debug)
        print ">  +" entry_blocks " block" (entry_blocks==1 ? "" : "s") "; " T+entry_blocks " blocks total" >"/dev/stderr"

    T += entry_blocks

    # Two extra blocks when path plus file name is longer than 100 chars
    if (length(filename) > 100) {
        if (debug)
            print ">  +2 blocks (length of file name and path > 100)" >"/dev/stderr"
        T += 2
    }

    # Two more when the entry is a symlink whose target path is longer than
    # 100 chars
    if (length(symlink_to) > 100) {
        if (debug)
            print ">  +2 blocks (length of symlinked file name and path > 100)" >"/dev/stderr"
        T += 2
    }
}

# Once the whole listing has been read, report the estimate. Tar padding
# accounts for 2 more blocks (the figure shown in the debug summary), but the
# printed size is based on only 1 of them: with the full amount, 'pv' tops out
# at 99% instead of the desired 100%. Halving the count of 512-byte blocks
# gives the size in KiB.
END {
    if (debug) {
        padded_blocks = T + 2
        printf("> %13i blocks in " N " entries\n", padded_blocks) >"/dev/stderr"
    }

    size_kib = int((T + 1) / 2)
    printf("%15ik", size_kib)

    # In debug mode, finish the size line with a newline
    if (debug)
        print ""
}

# vim: tabstop=4