New Python Markdown extension toc_fixer

The toc_fixer extension is implemented as a post-processor.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python
"""
Table of Contents fixer extension for Python-Markdown
=====================================================

Fixes up the Table of Contents by removing entries up to and including the
"Table of Contents" line (or as defined in the toc extension.)
"""

# (some standard markdown.extension stuff snipped)

def run(self, text):
    """Trim the generated Table of Contents inside rendered HTML.

    Finds the first ``<div class="toc">...</div>`` in *text*, drops every
    TOC link up to and including the one whose label is
    'Table of Contents', re-indents the remaining markup by nesting depth
    (two spaces per level), collapses each ``<li>`` / ``<a ...>`` / ``</li>``
    triple onto a single line and removes elements left empty.

    If no link labelled 'Table of Contents' is present, every link is
    dropped and only the (empty) ``<div class="toc">`` wrapper remains.

    :param text: the complete HTML document as a string.
    :returns: *text* with the TOC ``<div>`` replaced by the trimmed
        version, or *text* unchanged when it contains no TOC ``<div>``.
    """
    # Locate the <div> holding the Table of Contents
    match = re.search(r'(?P<toc><div class="toc">.*?</div>)', text, re.DOTALL)
    if not match:
        return text
    start, end = match.span('toc')

    HREF_PATTERN = r'<a (id="[^"]+" )?href="#[^"]+">'
    RE_OPEN_TAG = re.compile(r'^<([a-z]{1,2})[a-z]*')
    RE_CLOSE_TAG = re.compile(r'/[a-z]*>$')
    RE_OPEN_LI = re.compile(r' *<li>$')
    RE_A = re.compile(
        r'(?P<P1> *<a )(?:id="toc-[0-9]{4}" )?'
        r'(?P<P2>href="#(?P<h1h2_id>.*?)">.*</a>.*)')
    RE_CLOSE_LI = re.compile(r'</li>$')
    RE_TOC = re.compile(HREF_PATTERN + r'Table of Contents</a>')
    RE_HREF = re.compile(HREF_PATTERN)
    # An opening tag directly followed by its closing tag at the same indent
    RE_EMPTY_ELEMENT = re.compile(r'^((\s*)<([a-z]+)>\n\2</\3>\n)', re.MULTILINE)

    toc = []
    spaces = ''
    found_TOC_line = False  # True = found line reading 'Table of Contents'

    # Put every element on its own line, then process the TOC line by line
    for line in re.sub(r'><', '>\n<', match.group('toc')).split('\n'):
        is_href = bool(RE_HREF.match(line))

        # An opening tag increases the indent before the line is stored
        if RE_OPEN_TAG.match(line):
            spaces += '  '

        # Keep the line, unless it is a link and the 'Table of Contents'
        # link has not been seen yet
        if not is_href or found_TOC_line:
            toc.append(spaces + line)

        # Checked after the append, so the 'Table of Contents' link itself
        # is dropped as well
        if RE_TOC.match(line):
            found_TOC_line = True

        # A closing tag decreases the indent after the line is stored
        if RE_CLOSE_TAG.search(line):
            spaces = spaces[:-2]

        # Join three lines (as follows) into one:
        #   <li>
        #     <a id="toc-nnnn" href="#somewhere-in-the-body">Somewhere in the Body</a>
        #   </li>
        # The length check keeps a stray </li> in malformed input from
        # raising IndexError.
        if (RE_CLOSE_LI.match(line) and len(toc) >= 3
                and RE_OPEN_LI.match(toc[-3]) and RE_A.match(toc[-2])):
            toc.pop()                       # the '</li>' line
            a_line = toc.pop()              # the '<a ...>' line
            li_line = toc.pop()             # the '<li>' line, indent kept
            toc.append('{}{}</li>'.format(li_line, a_line.lstrip()))

    # Remove empty elements from the TOC; repeat because removing an empty
    # <li> pair can leave its parent <ul> empty in turn
    toc_str = '\n'.join(toc)
    while RE_EMPTY_ELEMENT.search(toc_str):
        toc_str = RE_EMPTY_ELEMENT.sub('', toc_str)

    # span() is half-open, so these slices keep every character outside the
    # matched <div> (and behave correctly when the <div> starts at offset 0)
    return text[:start] + toc_str + text[end:]