awk program to merge "fail2ban" logs with other log files

I was interested in seeing the interaction between login failures on penguin and fail2ban. This can be done by merging /var/log/secure and /var/log/fail2ban.log together on date and time, with the complication that they use different formats for the time. So I wrote an awk program to standardise the output of the log files.

Sample run:

cd /var/log
awk -f /r/merge-logs-for-fail2ban.awk fail2ban.log secure httpd/*{access,error}_log | sort | less

File /r/merge-logs-for-fail2ban.awk

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/awk -f
# The "-f" above is required: without it, executing this file directly makes
# awk treat the script's path as program text instead of reading the program
# from the file.
#
# NOTE: this script needs GNU awk (gawk). It relies on strftime()/systime()
# and on the three-argument form of match(), which are gawk extensions.
BEGIN {
    # Three-letter English month abbreviation -> zero-padded month number.
    month["Jan"]="01"; month["Feb"]="02"; month["Mar"]="03"
    month["Apr"]="04"; month["May"]="05"; month["Jun"]="06"
    month["Jul"]="07"; month["Aug"]="08"; month["Sep"]="09"
    month["Oct"]="10"; month["Nov"]="11"; month["Dec"]="12"

    # Four-digit current year, used for log formats whose timestamps
    # do not include a year.
    curr_year = strftime("%Y", systime())
}
# Whenever the input file name changes, report the new file on stderr so
# the notice does not end up in the merged output on stdout.
prev_FILENAME != FILENAME {
    prev_FILENAME = FILENAME
    printf("Processing %s\n", FILENAME) > "/dev/stderr"
}

# Every record starts with an empty source; it stays empty unless one of
# the format-specific rules below recognises the record and fills it in.
{
    source = ""
}

# Append a pseudo-"milliseconds" suffix to a timestamp that only has
# one-second resolution. The suffix is not a real time: it is a counter that
# starts at 000 and increases by one for every consecutive call with the same
# timestamp, so that records within the same second keep their original
# order after a textual sort.
#
# Parameters:
#   date_time  timestamp in the format "YYYY-MM-DD HH:MM:SS"
#   suffix     (local scratch variable - callers must not pass it)
# Returns:
#   "YYYY-MM-DD HH:MM:SS,mmm"
# Globals used as state between calls:
#   prev_date_time  the timestamp seen on the previous call
#   msec            the current counter value
function timestamp_msec(date_time,    suffix) {
    if (date_time == prev_date_time) {
        # Clamp at 999 so the suffix always stays three digits wide;
        # a fourth digit would break the fixed-width format and sort
        # ",1000" before ",101".
        if (msec < 999) msec++
    } else {
        msec = 0
    }
    prev_date_time = date_time
    suffix = sprintf("%03d", msec)
    return date_time "," suffix
}

# fail2ban log file
#           $1         $2           $3                      $4      $5-
# fail2ban: 2018-11-07 17:46:07,711 fail2ban.actions        [1466]: message
#
# The record is recognised by an ISO date (YYYY-MM-DD) in $1. The day part
# accepts 00-39 so that the 30th and 31st are not rejected, and the year part
# accepts any year 2000-2999.
$1 ~ /^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$/ {
    # Entries from the fail2ban.filtersystemd component are left out.
    if ($3 == "fail2ban.filtersystemd") { next }

    # $2 already carries ",mmm" milliseconds, so no counter suffix is needed.
    date_time = $1 " " $2

    # Replace the 9-character "fail2ban." prefix of $3 with the shorter "f2b.".
    source = "f2b." substr($3, 10)
    pid = $4

    # The message is everything after the first "[digits]: ".
    match($0, /\[[0-9]+\]: (.*)/, a)
    message = a[1]
}

# Apache httpd access log file
# $1         $2 $3 $4                   $5      $6
# 50.31.96.12 - - [10/Aug/2014:04:03:16 -0500] "GET /assets/nodes_map.css HTTP/1.1" 200 14198
#                  a[1] a[2] a[3] a[4]
#
# The record is recognised by "[DD/Mon/YYYY:HH:MM:SS" in $4. The day part
# accepts 00-39 so that the 30th and 31st are not rejected, and the year part
# accepts any year 2000-2999.
match($4, /\[([0-3][0-9])\/([ADFJMNOS][aceopu][bcglnprtvy])\/(2[0-9][0-9][0-9]):([012][0-9]:[0-5][0-9]:[0-5][0-9])/, a) {
    # Rebuild the timestamp as "YYYY-MM-DD HH:MM:SS" and add the counter
    # suffix, because this format only has one-second resolution.
    date_time = timestamp_msec(a[3] "-" month[a[2]] "-" a[1] " " a[4])

    # Source is the part of the file name in front of "_log",
    # e.g. "access" for "access_log". This overwrites a[].
    match(FILENAME, /([a-z_]+)_log/, a)
    source = a[1]

    # This log format has no process id.
    pid = ""

    # a[1] = the quoted request line, a[2] = the number following it.
    match ($0, /("[^"]+") ([0-9]+)/, a)
    message = $1 " " a[1] " " a[2]  # IP, request, and status
}

# Apache httpd error file
# $1   $2  $3 $4              $5    $6            $7   $8    $9-
# [Sun Nov 04 03:38:21.299746 2018] [core:notice] [pid 1613] message
#                         a[1]        a[2]     a[3]
#
# a[1] = HH:MM:SS, a[2] = first three digits of the fractional seconds,
# a[3] = four-character year.
match($0, /^\[... ... .. (..:..:..)\.(...)... (....)\]/, a) {
    # The log already has sub-second resolution, so its own first three
    # fractional digits are used as the ",mmm" suffix.
    date_time = a[3] "-" month[$2] "-" $3 " " a[1] "," a[2]

    # Source is the part of the file name in front of "_log",
    # e.g. "error" for "error_log". This overwrites a[].
    match(FILENAME, /([a-z_]+)_log/, a)
    source = a[1]

    # Accept both "[pid 1613]" and "[pid 1613:tid 140...]"; the second form
    # appears when the thread id is part of the error-log format.
    # a[1] = pid, a[2] = optional ":tid N" part, a[3] = rest of the line.
    match($0, /\[pid ([0-9]+)(:tid [0-9]+)?\] (.*)/, a)
    pid = a[1]
    message = a[3]
}

# /var/log/secure
# $1  $2 $3       $4      $5           $6-
# Nov  5 19:14:21 penguin sshd[10119]: message
#
# $1 must be exactly a month abbreviation. An unanchored month-like pattern
# would also match "[Sun", "[Mon" and "[Sat" at the start of Apache error-log
# lines, and this rule would then overwrite the fields set for those lines.
$1 ~ /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$/ {
    # The timestamp has no year. Assume the current year, unless the month
    # is later than the current month: such a line cannot be from this year
    # yet, so it is taken to be from the previous one.
    year = curr_year
    if (month[$1] + 0 > strftime("%m") + 0) { year = curr_year - 1 }

    # Zero-pad the day and add the counter suffix, because this format only
    # has one-second resolution.
    date_time = timestamp_msec(sprintf("%s-%s-%02d %s", year, month[$1], $2, $3))

    # a[1] = program name (lower-case letters), a[2] = "[pid]:", a[3] = message
    match($0, /([a-z]+)(\[[0-9]+\]:) (.*)/, a)
    source = a[1]
    pid = a[2]
    message = a[3]
}

# Print one normalised line for every record whose source was set:
#   2018-11-07 17:46:07,700 source       [pid]:   message
# Records with an empty source produce no output.
source {
    printf "%s %-12s %-8s %s\n", date_time, source, pid, message
}