awk program to remove page breaks from RFC files

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/awk -f
# Remove page headers and footers from an RFC downloaded using
# "wget https://www.rfc-editor.org/rfc/rfc####.txt"
#   or
# "elinks -dump https://tools.ietf.org/html/rfc#### >RFC####.text"
# Output is to stdout.

# sw is the output switch shared by all rules below:
#   sw > 0   lines are being printed
#   sw == 0  one or more blank lines were just seen in normal text
#   sw < 0   output is stopped for a page break (footer seen)
BEGIN {
    sw = 1
}

# Count blank lines but don't write them.
#
# Under a POSIX awk, [[:space:]] includes the form feed, so a line holding
# only "\f" lands in this rule.  In a paginated RFC text file that line is
# the page-break separator sitting between the footer and the next page's
# header.  It has to advance the page-break counter the same way a non-blank
# separator line does; otherwise the counter runs one step short and the
# first text line of the following page is skipped along with the header.
/^[[:space:]]*$/ {
    if (sw > 0) { sw = 0 }
    # Only the separator directly after the footer (sw == -1) is counted,
    # so stray form feeds elsewhere cannot shift the counter.
    if (sw == -1 && index($0, "\f")) { sw = -2 }
    b_count++
}

# Handle a line that has visible content.
# Drives the sw state: leaves the "blank run" state, steps through the
# footer/dashes/header lines of a page break, detects a new footer, and
# emits the single blank line that stands in for a run of blank lines.
$0 !~ /^[[:space:]]*$/ {
    if (sw == 0) {
        # A run of blank lines inside normal text has ended.
        sw = 1
    } else if (sw < 0) {
        # Output is stopped for a page break: step the counter once more.
        sw--
        if (sw == -4) {
            # Counter has run out: output resumes with the current line.
            if (b_sw) {
                print ""        # restore the blank line the page break swallowed
            }
            b_sw = 0
            b_count = 0
            sw = 1
        }
    }

    if (match($0, /\[Page [0-9]+\] *$/)) {
        # Footer line: remember whether more than three blank lines came
        # before it (a blank line is then printed on resume), and stop output.
        b_sw = (b_count > 3)
        sw = -1
    }

    # Collapse the preceding blank line(s) into one, but only while printing.
    if (sw > 0 && b_count) {
        print ""
    }
    b_count = 0
}

# Emit the current line whenever the output switch is on.
# Blank lines never get here with sw > 0: the blank-line rule sets sw to 0 first.
sw > 0 {
    print
}