tor/scripts/maint/format_changelog.py

#!/usr/bin/python
# Copyright (c) 2014, The Tor Project, Inc.
# See LICENSE for licensing information
#
# This script reformats a section of the changelog to wrap everything to
# the right width and put blank lines in the right places.  Eventually,
# it might include a linter.
#
# To run it, pipe a section of the changelog (starting with "Changes
# in Tor 0.x.y.z-alpha" through the script.)

import os
import re
import sys
import optparse

# ==============================
# Oh, look!  It's a cruddy approximation to Knuth's elegant text wrapping
# algorithm, with totally ad hoc parameters!
#
# We're trying to minimize:
#    The total of the cubes of ragged space on underflowed intermediate lines,
#  PLUS
#    100 * the fourth power of overflowed characters
#  PLUS
#    .1 * a bit more than the cube of ragged space on the last line.
#  PLUS
#    OPENPAREN_PENALTY for each line that starts with (
#
# We use an obvious dynamic programming algorithm to sorta approximate this.
# It's not coded right or optimally, but it's fast enough for changelogs
#
# (Code found in an old directory of mine, lightly cleaned. -NM)

NO_HYPHENATE=set("""
pf-divert
tor-resolve
tor-gencert
tor-fw-helper
""".split())

LASTLINE_UNDERFLOW_EXPONENT = 1
LASTLINE_UNDERFLOW_PENALTY = 1

UNDERFLOW_EXPONENT = 3
UNDERFLOW_PENALTY = 1

OVERFLOW_EXPONENT = 4
OVERFLOW_PENALTY = 2000

ORPHAN_PENALTY = 10000

OPENPAREN_PENALTY = 200

def generate_wrapping(words, divisions):
    lines = []
    last = 0
    for i in divisions:
        w = words[last:i]
        last = i
        line = " ".join(w).replace("\xff ","-").replace("\xff","-")
        lines.append(line)
    return lines

def wrapping_quality(words, divisions, width1, width2):
    total = 0.0

    lines = generate_wrapping(words, divisions)
    for line in lines:
        length = len(line)
        if line is lines[0]:
            width = width1
        else:
            width = width2

        if line[0:1] == '(':
            total += OPENPAREN_PENALTY

        if length > width:
            total += OVERFLOW_PENALTY * (
                (length - width) ** OVERFLOW_EXPONENT )
        else:
            if line is lines[-1]:
                e,p = (LASTLINE_UNDERFLOW_EXPONENT, LASTLINE_UNDERFLOW_PENALTY)
                if " " not in line:
                    total += ORPHAN_PENALTY
            else:
                e,p = (UNDERFLOW_EXPONENT, UNDERFLOW_PENALTY)

            total += p * ((width - length) ** e)

    return total

def wrap_graf(words, prefix_len1=0, prefix_len2=0, width=72):
    wrapping_after = [ (0,), ]

    w1 = width - prefix_len1
    w2 = width - prefix_len2

    for i in range(1, len(words)+1):
        best_so_far = None
        best_score = 1e300
        for j in range(i):
            t = wrapping_after[j]
            t1 = t[:-1] + (i,)
            t2 = t + (i,)
            wq1 = wrapping_quality(words, t1, w1, w2)
            wq2 = wrapping_quality(words, t2, w1, w2)

            if wq1 < best_score:
                best_so_far = t1
                best_score = wq1
            if wq2 < best_score:
                best_so_far = t2
                best_score = wq2
        wrapping_after.append( best_so_far )

    lines = generate_wrapping(words, wrapping_after[-1])

    return lines

def hyphenatable(word):
    if "--" in word:
        return False

    if re.match(r'^[^\d\-]\D*-', word):
        stripped = re.sub(r'^\W+','',word)
        stripped = re.sub(r'\W+$','',word)
        return stripped not in NO_HYPHENATE
    else:
        return False

def split_paragraph(s):
    "Split paragraph into words; tuned for Tor."

    r = []
    for word in s.split():
        if hyphenatable(word):
            while "-" in word:
                a,word = word.split("-",1)
                r.append(a+"\xff")
        r.append(word)
    return r

def fill(text, width, initial_indent, subsequent_indent):
    words = split_paragraph(text)
    lines = wrap_graf(words, len(initial_indent), len(subsequent_indent),
                      width)
    res = [ initial_indent, lines[0], "\n" ]
    for line in lines[1:]:
        res.append(subsequent_indent)
        res.append(line)
        res.append("\n")
    return "".join(res)

# ==============================


TP_MAINHEAD = 0
TP_HEADTEXT = 1
TP_BLANK = 2
TP_SECHEAD = 3
TP_ITEMFIRST = 4
TP_ITEMBODY = 5
TP_END = 6
TP_PREHEAD = 7

def head_parser(line):
    if re.match(r'^Changes in', line):
        return TP_MAINHEAD
    elif re.match(r'^[A-Za-z]', line):
        return TP_PREHEAD
    elif re.match(r'^  o ', line):
        return TP_SECHEAD
    elif re.match(r'^\s*$', line):
        return TP_BLANK
    else:
        return TP_HEADTEXT

def body_parser(line):
    if re.match(r'^  o ', line):
        return TP_SECHEAD
    elif re.match(r'^    -',line):
        return TP_ITEMFIRST
    elif re.match(r'^      \S', line):
        return TP_ITEMBODY
    elif re.match(r'^\s*$', line):
        return TP_BLANK
    elif re.match(r'^Changes in', line):
        return TP_END
    elif re.match(r'^\s+\S', line):
        return TP_HEADTEXT
    else:
        print "Weird line %r"%line

def clean_head(head):
    return head

def head_score(s):
    m = re.match(r'^ +o (.*)', s)
    if not m:
        print >>sys.stderr, "Can't score %r"%s
        return 99999
    lw = m.group(1).lower()
    if lw.startswith("security") and "feature" not in lw:
        score = -300
    elif lw.startswith("deprecated version"):
        score = -200
    elif (('new' in lw and 'requirement' in lw) or
          ('new' in lw and 'dependenc' in lw) or
          ('build' in lw and 'requirement' in lw) or
          ('removed' in lw and 'platform' in lw)):
        score = -100
    elif lw.startswith("major feature"):
        score = 00
    elif lw.startswith("major bug"):
        score = 50
    elif lw.startswith("major"):
        score = 70
    elif lw.startswith("minor feature"):
        score = 200
    elif lw.startswith("minor bug"):
        score = 250
    elif lw.startswith("minor"):
        score = 270
    else:
        score = 1000

    if 'secur' in lw:
        score -= 2

    if "(other)" in lw:
        score += 2

    if '(' not in lw:
        score -= 1

    return score

class ChangeLog(object):
    def __init__(self, wrapText=True, blogOrder=True, drupalBreak=False):
        self.prehead = []
        self.mainhead = None
        self.headtext = []
        self.curgraf = None
        self.sections = []
        self.cursection = None
        self.lineno = 0
        self.wrapText = wrapText
        self.blogOrder = blogOrder
        self.drupalBreak = drupalBreak

    def addLine(self, tp, line):
        self.lineno += 1

        if tp == TP_MAINHEAD:
            assert not self.mainhead
            self.mainhead = line

        elif tp == TP_PREHEAD:
            self.prehead.append(line)

        elif tp == TP_HEADTEXT:
            if self.curgraf is None:
                self.curgraf = []
                self.headtext.append(self.curgraf)
            self.curgraf.append(line)

        elif tp == TP_BLANK:
            self.curgraf = None

        elif tp == TP_SECHEAD:
            self.cursection = [ self.lineno, line, [] ]
            self.sections.append(self.cursection)

        elif tp == TP_ITEMFIRST:
            item = ( self.lineno, [ [line] ])
            self.curgraf = item[1][0]
            self.cursection[2].append(item)

        elif tp == TP_ITEMBODY:
            if self.curgraf is None:
                self.curgraf = []
                self.cursection[2][-1][1].append(self.curgraf)
            self.curgraf.append(line)

        else:
            assert "This" is "unreachable"

    def lint_head(self, line, head):
        m = re.match(r'^ *o ([^\(]+)((?:\([^\)]+\))?):', head)
        if not m:
            print >>sys.stderr, "Weird header format on line %s"%line

    def lint_item(self, line, grafs, head_type):
        pass

    def lint(self):
        self.head_lines = {}
        for sec_line, sec_head, items in self.sections:
            head_type = self.lint_head(sec_line, sec_head)
            for item_line, grafs in items:
                self.lint_item(item_line, grafs, head_type)

    def dumpGraf(self,par,indent1,indent2=-1):
        if not self.wrapText:
            for line in par:
                print line
            return

        if indent2 == -1:
            indent2 = indent1
        text = " ".join(re.sub(r'\s+', ' ', line.strip()) for line in par)

        sys.stdout.write(fill(text,
                              width=72,
                              initial_indent=" "*indent1,
                              subsequent_indent=" "*indent2))

    def dumpPreheader(self, graf):
        self.dumpGraf(graf, 0)
        print

    def dumpMainhead(self, head):
        print head

    def dumpHeadGraf(self, graf):
        self.dumpGraf(graf, 2)
        print

    def dumpSectionHeader(self, header):
        print header

    def dumpStartOfSections(self):
        pass

    def dumpEndOfSections(self):
        pass

    def dumpEndOfSection(self):
        print

    def dumpEndOfChangelog(self):
        print

    def dumpDrupalBreak(self):
        pass

    def dumpItem(self, grafs):
        self.dumpGraf(grafs[0],4,6)
        for par in grafs[1:]:
            print
            self.dumpGraf(par,6,6)

    def collateAndSortSections(self):
        heads = []
        sectionsByHead = { }
        for _, head, items in self.sections:
            head = clean_head(head)
            try:
                s = sectionsByHead[head]
            except KeyError:
                s = sectionsByHead[head] = []
                heads.append( (head_score(head), head.lower(), head, s) )

            s.extend(items)

        heads.sort()
        self.sections = [ (0, head, items) for _1,_2,head,items in heads ]

    def dump(self):
        if self.prehead:
            self.dumpPreheader(self.prehead)

        if not self.blogOrder:
            self.dumpMainhead(self.mainhead)

        for par in self.headtext:
            self.dumpHeadGraf(par)

        if self.blogOrder:
            self.dumpMainhead(self.mainhead)

        drupalBreakAfter = None
        if self.drupalBreak and len(self.sections) > 4:
            drupalBreakAfter = self.sections[1][2]

        self.dumpStartOfSections()
        for _,head,items in self.sections:
            if not head.endswith(':'):
                print >>sys.stderr, "adding : to %r"%head
                head = head + ":"
            self.dumpSectionHeader(head)
            for _,grafs in items:
                self.dumpItem(grafs)
            self.dumpEndOfSection()
            if items is drupalBreakAfter:
                self.dumpDrupalBreak()
        self.dumpEndOfSections()
        self.dumpEndOfChangelog()

class HTMLChangeLog(ChangeLog):
    def __init__(self, *args, **kwargs):
        ChangeLog.__init__(self, *args, **kwargs)

    def htmlText(self, graf):
        for line in graf:
            line = line.rstrip().replace("&","&amp;")
            line = line.rstrip().replace("<","&lt;").replace(">","&gt;")
            sys.stdout.write(line.strip())
            sys.stdout.write(" ")

    def htmlPar(self, graf):
        sys.stdout.write("<p>")
        self.htmlText(graf)
        sys.stdout.write("</p>\n")

    def dumpPreheader(self, graf):
        self.htmlPar(graf)

    def dumpMainhead(self, head):
        sys.stdout.write("<h2>%s</h2>"%head)

    def dumpHeadGraf(self, graf):
        self.htmlPar(graf)

    def dumpSectionHeader(self, header):
        header = header.replace(" o ", "", 1).lstrip()
        sys.stdout.write("  <li>%s\n"%header)
        sys.stdout.write("  <ul>\n")

    def dumpEndOfSection(self):
        sys.stdout.write("  </ul>\n\n")

    def dumpEndOfChangelog(self):
        pass

    def dumpStartOfSections(self):
        print "<ul>\n"

    def dumpEndOfSections(self):
        print "</ul>\n"

    def dumpDrupalBreak(self):
        print "\n</ul>\n"
        print "<p>&nbsp;</p>"
        print "\n<!--break-->\n\n"
        print "<ul>"

    def dumpItem(self, grafs):
        grafs[0][0] = grafs[0][0].replace(" - ", "", 1).lstrip()
        sys.stdout.write("  <li>")
        if len(grafs) > 1:
            for par in grafs:
                self.htmlPar(par)
        else:
            self.htmlText(grafs[0])
        print

op = optparse.OptionParser(usage="usage: %prog [options] [filename]")
op.add_option('-W', '--no-wrap', action='store_false',
              dest='wrapText', default=True,
              help='Do not re-wrap paragraphs')
op.add_option('-S', '--no-sort', action='store_false',
              dest='sort', default=True,
              help='Do not sort or collate sections')
op.add_option('-o', '--output', dest='output',
              default='-', metavar='FILE', help="write output to FILE")
op.add_option('-H', '--html', action='store_true',
              dest='html', default=False,
              help="generate an HTML fragment")
op.add_option('-1', '--first', action='store_true',
              dest='firstOnly', default=False,
              help="write only the first section")
op.add_option('-b', '--blog-header', action='store_true',
              dest='blogOrder', default=False,
              help="Write the header in blog order")
op.add_option('-B', '--blog', action='store_true',
              dest='blogFormat', default=False,
              help="Set all other options as appropriate for a blog post")
op.add_option('--inplace', action='store_true',
              dest='inplace', default=False,
              help="Alter the ChangeLog in place")
op.add_option('--drupal-break', action='store_true',
              dest='drupalBreak', default=False,
              help='Insert a drupal-friendly <!--break--> as needed')

options,args = op.parse_args()

if options.blogFormat:
    options.blogOrder = True
    options.html = True
    options.sort = False
    options.wrapText = False
    options.firstOnly = True
    options.drupalBreak = True

if len(args) > 1:
    op.error("Too many arguments")
elif len(args) == 0:
    fname = 'ChangeLog'
else:
    fname = args[0]

if options.inplace:
    assert options.output == '-'
    options.output = fname

if fname != '-':
    sys.stdin = open(fname, 'r')

nextline = None

if options.html:
    ChangeLogClass = HTMLChangeLog
else:
    ChangeLogClass = ChangeLog

CL = ChangeLogClass(wrapText=options.wrapText,
                    blogOrder=options.blogOrder,
                    drupalBreak=options.drupalBreak)
parser = head_parser

for line in sys.stdin:
    line = line.rstrip()
    tp = parser(line)

    if tp == TP_SECHEAD:
        parser = body_parser
    elif tp == TP_END:
        nextline = line
        break

    CL.addLine(tp,line)

CL.lint()

if options.output != '-':
    fname_new = options.output+".new"
    fname_out = options.output
    sys.stdout = open(fname_new, 'w')
else:
    fname_new = fname_out = None

if options.sort:
    CL.collateAndSortSections()

CL.dump()

if options.firstOnly:
    sys.exit(0)

if nextline is not None:
    print nextline

for line in sys.stdin:
    sys.stdout.write(line)

if fname_new is not None:
    os.rename(fname_new, fname_out)