|
#!/usr/bin/awk -f
##---------------------------------------------------------------------------##
# Program: estimate-tar-size
# Author: Brian <genius@groupbcl.ca> :)
# Date: January 2018
#
# Given the output of an "ls -l" or "ls -lR" command, computes the number
# of 512 byte blocks the directories and files in the listing would take
# when packed into a tar file. Output is to stdout in the form '#k'
# (where '#' is the expected size of the tar file in KiB), which can be
# passed to 'pv' to monitor the progress of creating the tar file.
#
# Example:
# DIR=directory-name
# TAR_SIZE=$(ls -laR $DIR | estimate-tar-size); echo $TAR_SIZE
# tar cf - $DIR | pv -s$TAR_SIZE | bzip2 >$DIR.tar.bz2
#
# Debugging output, written to stderr, can be seen by passing '-vdebug=1'
# to the program:
#
# ls -lR some-directory | estimate-tar-size -vdebug=1 2>&1 | less
#
# The final number will be off by a little bit because this program
# assumes a tar blocking factor of 1, while tar itself usually uses 20.
#
##---------------------------------------------------------------------------##
# BUUS: This script is part of Brian's Useful Utilities Set
# Running totals: T = 512-byte blocks counted so far, N = entries processed
BEGIN {
    N = 0
    T = 0
}
# With debugging enabled (-vdebug=1), echo every input record and its length
# to stderr before any other rule looks at it
{
    if (debug)
        print "read[" length($0) "] " $0 >"/dev/stderr"
}
# A "total <n>" summary line describes no entry of its own; drop it
$0 ~ /^total [0-9]+$/ {
    if (debug)
        print "> found total line (skipping)" >"/dev/stderr"
    next
}
# Directory tracking. A recursive listing introduces each directory with a
# "path:" header line; curr_dir remembers that path (with a trailing "/") so
# later rules can prefix it to the names of the entries that follow.
#
# If the very first line ends with ":", treat it as a directory header
NR == 1 && /:$/ { dir_line = 1 }
# Consume a directory header line
dir_line {
    # Swap the trailing ":" for "/". Done with sub() on a copy instead of
    # gensub(), which only GNU awk provides; the result is the same.
    curr_dir = $0
    sub(/:$/, "/", curr_dir)
    if (debug) { print "> found directory line; curr_dir=\"" curr_dir "\"" >"/dev/stderr" }
    dir_line = 0
    next
}
# The line after a blank line is a directory header. The blank line itself
# falls through and is discarded by the short-line failsafe that follows.
length($0) == 0 { dir_line = 1 }
# Failsafe: if we have less than 9 fields, we don't have a full 'ls' line.
# Count the fields rather than testing the truth of $9: a file whose name (or
# first word) is "0" gives a $9 that evaluates as false and would otherwise
# be skipped as if the line were too short.
NF < 9 { next }
# Figure out *exactly* where in $0 the file/directory/pipe/etc name starts. We
# can't rely on the name always appearing in the same column because long user
# and group names, or unusually large file sizes, can throw it off. And we
# can't simply look in $0 for the start of the file name, because that can fail
# in a couple of unexpected ways. For example, if the file name is a single
# number, that number might be found in the file length or time field. Or a
# file named 'jane acceptance letter.doc' would have $9=='jane', which might
# also be found in the owner or group field. So we format the fields following
# the user and group names, up to and including the first word of the file
# name, into a string, then determine where in $0 that longish substring
# starts. We then adjust for length(test_str)-length($9).
# Locate the entry's name inside the raw line and split off a symlink target.
# Sets: N          (count of entries processed)
#       filename   (curr_dir followed by the entry's name)
#       symlink_to (link target; "" unless the entry is a symlink)
# Exits with status 1 if the name cannot be located.
{
    N++ # Number of entries processed
    #    $1      $2  $3   $4   $5  $6 $7  $8   $9     $10
    # brw-r--r--  1 root root 11,   0 Jan  9 18:01 block-sr0
    if ($1 ~ /^[bc]/) { # Block or character file
        # Anchor on the minor number, not the major: GNU ls right-aligns the
        # minor ("8,   0"), so the gap between $5 and $6 is not always one
        # space, whereas minor and month are separated by exactly one.
        test_str = sprintf("%s %s %2s %5s %s", $6, $7, $8, $9, $10)
        fn_start = $10 # (first word of file name)
    }
    #    $1      $2  $3   $4     $5    $6  $7  $8      $9
    # -rw-r--r--  1 root root 2820004 May 23  2013 regular-file
    else { # Everything else
        test_str = sprintf("%s %s %2s %5s %s", $5, $6, $7, $8, $9)
        fn_start = $9 # (first word of file name)
    }
    # index() returns 0 when test_str is absent; anything before column 10
    # would still be inside the mode field, so it cannot be a real hit.
    i = index($0, test_str)
    if (i < 10) {
        print "ASSERT: unable to find test_str in $0" >"/dev/stderr"
        print " $0 = \"" $0 "\"" >"/dev/stderr"
        print " test_str = \"" test_str "\"" >"/dev/stderr"
        # Non-zero status so callers can tell the estimate is unusable
        exit 1
    }
    # test_str ends with the first word of the name, so step back over that
    # word to reach the column where the full name begins.
    filename = (curr_dir ? curr_dir : "") substr($0, i+length(test_str)-length(fn_start))
    symlink_to = ""
    # Only symlink entries carry a "name -> target" suffix; a regular file
    # whose own name contains " -> " must be left intact. The greedy ".*"
    # splits at the last " -> ". Two-argument match() keeps this portable.
    if ($1 ~ /^l/ && match(filename, /^.* -> /)) {
        symlink_to = substr(filename, RLENGTH + 1)
        filename = substr(filename, 1, RLENGTH - 4)
    }
    if (debug) {
        print "> [" length(filename) "] \"" filename "\"" >"/dev/stderr"
        if (symlink_to) {
            print "-> [" length(symlink_to) "] \"" symlink_to "\"" >"/dev/stderr"
        }
    }
}
# Charge this entry's blocks to the running total T.
{
    # Default: an entry with no data (dir, pipe, char, block files).
    # S = -512 makes int(S/512)+2 come out as 1, i.e. the header block only.
    S = -512
    # A regular file with a non-zero size contributes its data as well.
    # Using size-1 keeps an exact multiple of 512 from gaining an extra block.
    if ($0 ~ /^-/ && $5)
        S = $5 - 1
    if (debug) {
        i = int(S/512) + 2
        print "> +" i " block" (i==1 ? "" : "s") "; " (T+i) " blocks total" >"/dev/stderr"
    }
    # Data blocks plus the header block
    T += int(S/512) + 2
}
# Surcharges for over-long names, applied to every entry that gets this far.
{
    # A path plus file name longer than 100 chars costs two extra blocks
    if (length(filename) > 100) {
        if (debug)
            print "> +2 blocks (length of file name and path > 100)" >"/dev/stderr"
        T += 2
    }
    # Likewise two more when the entry is a symlink and the path it points
    # to is longer than 100 chars
    if (length(symlink_to) > 100) {
        if (debug)
            print "> +2 blocks (length of symlinked file name and path > 100)" >"/dev/stderr"
        T += 2
    }
}
# After the last entry, tar padding accounts for 2 further blocks. The debug
# line reports that full figure (T+2); the estimate written to stdout is
# deliberately one block short of it (T+1), because otherwise 'pv' tops out
# at 99% instead of reaching 100%. Halving converts 512-byte blocks to KiB.
# The estimate is written without a trailing newline unless debugging is on.
END {
    if (debug)
        printf("> %13i blocks in " N " entries\n", T+2) >"/dev/stderr"
    printf("%15ik", int((T+1)/2))
    if (debug)
        print ""
}
# vim: tabstop=4
|