tor/scripts/maint/format_changelog.py

#!/usr/bin/python
# Copyright (c) 2014, The Tor Project, Inc.
# See LICENSE for licensing information
#
# This script reformats a section of the changelog to wrap everything to
# the right width and put blank lines in the right places.  Eventually,
# it might include a linter.
#
# To run it, invoke it from the directory that contains the "ChangeLog"
# file.  It reformats the first section (everything up to the second
# "Changes in" line) and rewrites ChangeLog in place.

import os
import re
import sys

# ==============================
# Oh, look!  It's a cruddy approximation to Knuth's elegant text wrapping
# algorithm, with totally ad hoc parameters!
#
# We're trying to minimize:
#    The total of the cubes of ragged space on underflowed intermediate lines,
#  PLUS
#    2000 * the fourth power of overflowed characters
#  PLUS
#    the ragged space on the last line, plus a flat 10000 if that last
#    line is a single orphaned word.
#  (The exact weights are the *_EXPONENT / *_PENALTY constants below.)
#
# We use an obvious dynamic programming algorithm to sorta approximate this.
# It's not coded right or optimally, but it's fast enough for changelogs
#
# (Code found in an old directory of mine, lightly cleaned. -NM)

# Hyphenated words that must never be broken at their hyphens.  Add new
# entries one per line inside the string.
_NO_HYPHENATE_WORDS = """
pf-divert
"""
NO_HYPHENATE = set(_NO_HYPHENATE_WORDS.split())

# Each (exponent, penalty) pair below is applied by wrapping_quality() as
#     penalty * (number_of_characters ** exponent)

# Unused space on the last line of a paragraph.
(LASTLINE_UNDERFLOW_EXPONENT, LASTLINE_UNDERFLOW_PENALTY) = (1, 1)

# Unused space on every other line.
(UNDERFLOW_EXPONENT, UNDERFLOW_PENALTY) = (3, 1)

# Characters that stick out past the allowed width.
(OVERFLOW_EXPONENT, OVERFLOW_PENALTY) = (4, 2000)

# Flat cost added when the last line contains no space at all.
ORPHAN_PENALTY = 10000

def generate_wrapping(words, divisions):
    """Return the text lines obtained by breaking words at divisions.

    divisions is a sequence of end indices into words: the first line
    holds words[0:divisions[0]], the next one continues from there up to
    divisions[1], and so on.  Words on a line are joined with single
    spaces.  A "\xff" marks a hyphenation point: it is rendered as "-",
    and the space that the join put after it is dropped.
    """
    ends = list(divisions)
    starts = [0] + ends[:-1]
    return [
        " ".join(words[begin:end]).replace("\xff ", "-").replace("\xff", "-")
        for begin, end in zip(starts, ends)
    ]

def wrapping_quality(words, divisions, width1, width2):
    """Score the wrapping of words described by divisions; lower is better.

    width1 is the room available on the first line and width2 the room on
    every following line.  The lines are produced by generate_wrapping()
    and each one adds to the score:

      * a line longer than its width adds
        OVERFLOW_PENALTY * excess ** OVERFLOW_EXPONENT;
      * the last line, when it fits, adds
        LASTLINE_UNDERFLOW_PENALTY * slack ** LASTLINE_UNDERFLOW_EXPONENT,
        plus ORPHAN_PENALTY if it contains no space;
      * any other line that fits adds
        UNDERFLOW_PENALTY * slack ** UNDERFLOW_EXPONENT.

    Returns the total as a float.
    """
    total = 0.0

    lines = generate_wrapping(words, divisions)
    last_index = len(lines) - 1
    # Positions are compared by index.  Comparing the strings themselves
    # with "is" is wrong whenever two equal lines happen to be the same
    # string object (short strings can be shared by the interpreter).
    for index, line in enumerate(lines):
        length = len(line)
        if index == 0:
            width = width1
        else:
            width = width2

        if length > width:
            total += OVERFLOW_PENALTY * (
                (length - width) ** OVERFLOW_EXPONENT)
        else:
            if index == last_index:
                e, p = (LASTLINE_UNDERFLOW_EXPONENT,
                        LASTLINE_UNDERFLOW_PENALTY)
                if " " not in line:
                    # A lone word on the final line is an orphan.
                    total += ORPHAN_PENALTY
            else:
                e, p = (UNDERFLOW_EXPONENT, UNDERFLOW_PENALTY)

            total += p * ((width - length) ** e)

    return total

def wrap_graf(words, prefix_len1=0, prefix_len2=0, width=72):
    """Pick line breaks for words and return the resulting lines.

    prefix_len1 and prefix_len2 are the indentation widths reserved on
    the first line and on the following lines; they are subtracted from
    width to get the room left for text.  Candidate break sets are scored
    with wrapping_quality() and the lowest score wins; on a tie the
    candidate examined first is kept.
    """
    first_room = width - prefix_len1
    later_room = width - prefix_len2

    # chosen[k] is the tuple of break positions picked for the first k
    # words; chosen[0] is the starting point for an empty prefix.
    chosen = [(0,)]

    for end in range(1, len(words) + 1):
        winner = None
        winner_score = 1e300
        for earlier in chosen:
            # Two ways to extend an earlier solution: stretch its final
            # line so that it ends at `end`, or keep it as is and start
            # one more line that ends at `end`.
            for candidate in (earlier[:-1] + (end,), earlier + (end,)):
                score = wrapping_quality(words, candidate,
                                         first_room, later_room)
                if score < winner_score:
                    winner = candidate
                    winner_score = score
        chosen.append(winner)

    return generate_wrapping(words, chosen[-1])

def hyphenateable(word, no_hyphenate=None):
    """Return True if word may be broken across lines at its hyphens.

    A word qualifies when it starts with something other than a digit or
    a hyphen, contains a hyphen further on, and -- once leading and
    trailing non-word characters are stripped -- is not listed in
    no_hyphenate.

    no_hyphenate is a container of words to keep intact; when it is None
    the module-level NO_HYPHENATE set is used.
    """
    if not re.match(r'^[^\d\-].*-', word):
        return False
    if no_hyphenate is None:
        no_hyphenate = NO_HYPHENATE
    # Strip punctuation from BOTH ends.  The second substitution has to
    # work on the result of the first, otherwise the leading strip is lost
    # and a word such as "(pf-divert" never matches the exclusion list.
    stripped = re.sub(r'^\W+', '', word)
    stripped = re.sub(r'\W+$', '', stripped)
    return stripped not in no_hyphenate

def split_paragraph(s):
    """Break the paragraph s into wrappable pieces; tuned for Tor.

    The text is split on whitespace.  A word accepted by hyphenateable()
    is further cut at each of its hyphens; every piece except the last
    has the hyphen replaced by a trailing "\xff" marker, which
    generate_wrapping() later turns back into "-".
    """
    pieces = []
    for token in s.split():
        if hyphenateable(token):
            fragments = token.split("-")
            pieces.extend(frag + "\xff" for frag in fragments[:-1])
            token = fragments[-1]
        pieces.append(token)
    return pieces

def fill(text, width, initial_indent, subsequent_indent):
    """Wrap text to width and return it as one newline-terminated string.

    The first output line is prefixed with initial_indent and every other
    line with subsequent_indent; the lengths of the two indents are taken
    off the room available for text on the respective lines.
    """
    wrapped = wrap_graf(split_paragraph(text),
                        len(initial_indent), len(subsequent_indent),
                        width)
    rendered = [initial_indent + wrapped[0]]
    rendered.extend(subsequent_indent + rest for rest in wrapped[1:])
    return "\n".join(rendered) + "\n"

# ==============================


# Line types produced by head_parser() and body_parser().
(
    TP_MAINHEAD,   # heading line: starts with a capital letter
    TP_HEADTEXT,   # any other non-blank line before the first section
    TP_BLANK,      # empty or whitespace-only line
    TP_SECHEAD,    # section heading: "  o ..."
    TP_ITEMFIRST,  # first line of an item: "    -..."
    TP_ITEMBODY,   # continuation line of an item (six-space indent)
    TP_END,        # "Changes in" line that starts the next section
) = range(7)

def head_parser(line):
    """Classify a line from the part of the changelog before any section.

    Returns TP_MAINHEAD for a line starting with a capital letter,
    TP_SECHEAD for a "  o " section heading, TP_BLANK for an empty or
    whitespace-only line, and TP_HEADTEXT for anything else.
    """
    # Checked in order; the first pattern that matches decides the type.
    rules = (
        (r'^[A-Z]', TP_MAINHEAD),
        (r'^  o ', TP_SECHEAD),
        (r'^\s*$', TP_BLANK),
    )
    for pattern, line_type in rules:
        if re.match(pattern, line):
            return line_type
    return TP_HEADTEXT

def body_parser(line):
    """Classify a line that comes after the first section heading.

    Returns TP_SECHEAD for "  o " headings, TP_ITEMFIRST for the "    -"
    line that opens an item, TP_ITEMBODY for a continuation line indented
    by six spaces, TP_BLANK for an empty or whitespace-only line and
    TP_END for a "Changes in" line.

    Any other line is reported on stderr, like the other diagnostics in
    this script, and None is returned.
    """
    if re.match(r'^  o ', line):
        return TP_SECHEAD
    elif re.match(r'^    -',line):
        return TP_ITEMFIRST
    elif re.match(r'^      \S', line):
        return TP_ITEMBODY
    elif re.match(r'^\s*$', line):
        return TP_BLANK
    elif re.match(r'^Changes in', line):
        return TP_END
    else:
        # Diagnostics go to stderr so they never mix with regular output.
        sys.stderr.write("Weird line %r\n" % line)
        return None

class ChangeLog(object):
    """In-memory model of one changelog section.

    Lines are fed in one at a time through addLine(), each already
    classified as one of the TP_* line types.  lint() reports suspicious
    section headings on stderr and dump() writes the re-wrapped text to
    sys.stdout.
    """

    def __init__(self):
        # The main heading line, or None until one has been added.
        self.mainhead = None
        # Paragraphs of introductory text; each is a list of raw lines.
        self.headtext = []
        # The paragraph (list of lines) currently being extended, or None
        # when the next text line has to start a new paragraph.
        self.curgraf = None
        # Sections in input order.  Each one is [lineno, heading, items]
        # and every item is a (lineno, paragraphs) tuple.
        self.sections = []
        # The entry of self.sections that new items are added to.
        self.cursection = None
        # Number of lines passed to addLine() so far.
        self.lineno = 0

    def addLine(self, tp, line):
        """Record line, whose type is tp (one of the TP_* constants).

        Raises AssertionError when tp is not a type this method handles,
        for instance the None that body_parser() returns for a line it
        cannot classify.  A second main heading trips an assert.
        """
        self.lineno += 1

        if tp == TP_MAINHEAD:
            assert not self.mainhead
            self.mainhead = line

        elif tp == TP_HEADTEXT:
            if self.curgraf is None:
                self.curgraf = []
                self.headtext.append(self.curgraf)
            self.curgraf.append(line)

        elif tp == TP_BLANK:
            # A blank line ends the current paragraph.
            self.curgraf = None

        elif tp == TP_SECHEAD:
            self.cursection = [ self.lineno, line, [] ]
            self.sections.append(self.cursection)

        elif tp == TP_ITEMFIRST:
            self._begin_item(line)

        elif tp == TP_ITEMBODY:
            self._continue_item(line)

        else:
            # Raised explicitly: an "assert" comparing two string literals
            # with "is" depends on interning and disappears under -O.
            raise AssertionError("unexpected line type %r on line %d"
                                 % (tp, self.lineno))

    def _begin_item(self, line):
        """Start a new item in the current section with line as its text."""
        item = ( self.lineno, [ [line] ])
        self.curgraf = item[1][0]
        self.cursection[2].append(item)

    def _continue_item(self, line):
        """Append line to the newest item of the current section."""
        if self.curgraf is None:
            # A blank line preceded this one: open a new paragraph in the
            # LAST item ([-1]) and add it to that item's paragraph list
            # ([1]).  Indexing [1][-1] instead would pick the paragraph
            # list of the section's second item.
            self.curgraf = []
            self.cursection[2][-1][1].append(self.curgraf)
        self.curgraf.append(line)

    def lint_head(self, line, head):
        """Warn on stderr if the section heading head looks malformed.

        line is the line number used in the message.  Returns None.
        """
        m = re.match(r'^ *o ([^\(]+)((?:\([^\)]+\))?):', head)
        if not m:
            sys.stderr.write("Weird header format on line %s\n" % line)

    def lint_item(self, line, grafs, head_type):
        """Placeholder for per-item checks; does nothing at present."""
        pass

    def lint(self):
        """Run lint_head() on every section and lint_item() on every item."""
        self.head_lines = {}
        for sec_line, sec_head, items in self.sections:
            head_type = self.lint_head(sec_line, sec_head)
            for item_line, grafs in items:
                self.lint_item(item_line, grafs, head_type)

    def dumpGraf(self,par,indent1,indent2=-1):
        """Write the paragraph par (a list of lines) to sys.stdout.

        The lines are merged into a single string with whitespace runs
        collapsed, then wrapped to 72 columns by fill().  indent1 is the
        number of spaces before the first line and indent2 the number
        before the remaining lines; -1 means "same as indent1".
        """
        if indent2 == -1:
            indent2 = indent1
        text = " ".join(re.sub(r'\s+', ' ', line.strip()) for line in par)

        sys.stdout.write(fill(text,
                              width=72,
                              initial_indent=" "*indent1,
                              subsequent_indent=" "*indent2))

    def dump(self):
        """Write the whole reformatted section to sys.stdout.

        A ':' is appended to any section heading that lacks one, with a
        note on stderr.
        """
        sys.stdout.write("%s\n" % (self.mainhead,))
        for par in self.headtext:
            self.dumpGraf(par, 2)
            sys.stdout.write("\n")
        for _,head,items in self.sections:
            if not head.endswith(':'):
                sys.stderr.write("adding : to %r\n" % head)
                head = head + ":"
            sys.stdout.write("%s\n" % (head,))
            for _,grafs in items:
                # First paragraph: "-" bullet at column 4, text hanging
                # at column 6.  Later paragraphs are indented by 6.
                self.dumpGraf(grafs[0],4,6)
                for par in grafs[1:]:
                    sys.stdout.write("\n")
                    self.dumpGraf(par,6,6)
            sys.stdout.write("\n")
        sys.stdout.write("\n")

# Driver: parse the first section of ./ChangeLog, write the reformatted
# section followed by the untouched remainder to ./ChangeLog.new, then
# move the new file over the old one.

CL = ChangeLog()
parser = head_parser
# The "Changes in" line that opens the following section, or None when the
# file holds only one section.
nextline = None

infile = open('ChangeLog', 'r')
try:
    for line in infile:
        line = line.rstrip()
        tp = parser(line)

        if tp == TP_SECHEAD:
            # From the first section heading on, lines are body lines.
            parser = body_parser
        elif tp == TP_END:
            nextline = line
            break

        CL.addLine(tp,line)

    CL.lint()

    outfile = open('ChangeLog.new', 'w')
    real_stdout = sys.stdout
    # CL.dump() writes to sys.stdout, so point that at the new file.
    sys.stdout = outfile
    try:
        CL.dump()

        if nextline is not None:
            outfile.write(nextline + "\n")

        # Copy everything after the first section through unchanged.
        for line in infile:
            outfile.write(line)
    finally:
        sys.stdout = real_stdout
        # Close (and thereby flush) before the rename below.
        outfile.close()
finally:
    infile.close()

os.rename('ChangeLog.new', 'ChangeLog')
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00			`#!/usr/bin/python`
			`# Copyright (c) 2014, The Tor Project, Inc.`
			`# See LICENSE for licensing information`
			`#`
			`# This script reformats a section of the changelog to wrap everything to`
			`# the right width and put blank lines in the right places. Eventually,`
			`# it might include a linter.`
			`#`
			`# To run it, pipe a section of the changelog (starting with "Changes`
			`# in Tor 0.x.y.z-alpha" through the script.)`

format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00			`import os`
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00			`import re`
			`import sys`
Tweak the changelog formatter a little. (I had a bad clone of Knuth's algorithm sitting around in an old code repository of mine. I added orphan detection and smarter hyphenation; it seems to give marginally better results than we had before.) 2014-05-02 18:50:23 +02:00
			`# ==============================`
			`# Oh, look! It's a cruddy approximation to Knuth's elegant text wrapping`
			`# algorithm, with totally ad hoc parameters!`
			`#`
			`# We're trying to minimize:`
			`# The total of the cubes of ragged space on underflowed intermediate lines,`
			`# PLUS`
			`# 100 * the fourth power of overflowed characters`
			`# PLUS`
			`# .1 * a bit more than the cube of ragged space on the last line.`
			`#`
			`# We use an obvious dynamic programming algorithm to sorta approximate this.`
			`# It's not coded right or optimally, but it's fast enough for changelogs`
			`#`
			`# (Code found in an old directory of mine, lightly cleaned. -NM)`

			`NO_HYPHENATE=set("""`
			`pf-divert`
			`""".split())`

			`LASTLINE_UNDERFLOW_EXPONENT = 1`
			`LASTLINE_UNDERFLOW_PENALTY = 1`

			`UNDERFLOW_EXPONENT = 3`
			`UNDERFLOW_PENALTY = 1`

			`OVERFLOW_EXPONENT = 4`
			`OVERFLOW_PENALTY = 2000`

			`ORPHAN_PENALTY = 10000`

			`def generate_wrapping(words, divisions):`
			`lines = []`
			`last = 0`
			`for i in divisions:`
			`w = words[last:i]`
			`last = i`
			`line = " ".join(w).replace("\xff ","-").replace("\xff","-")`
			`lines.append(line)`
			`return lines`

			`def wrapping_quality(words, divisions, width1, width2):`
			`total = 0.0`

			`lines = generate_wrapping(words, divisions)`
			`for line in lines:`
			`length = len(line)`
			`if line is lines[0]:`
			`width = width1`
			`else:`
			`width = width2`

			`if length > width:`
			`total += OVERFLOW_PENALTY * (`
			`(length - width) ** OVERFLOW_EXPONENT )`
			`else:`
			`if line is lines[-1]:`
			`e,p = (LASTLINE_UNDERFLOW_EXPONENT, LASTLINE_UNDERFLOW_PENALTY)`
			`if " " not in line:`
			`total += ORPHAN_PENALTY`
			`else:`
			`e,p = (UNDERFLOW_EXPONENT, UNDERFLOW_PENALTY)`

			`total += p * ((width - length) ** e)`

			`return total`

			`def wrap_graf(words, prefix_len1=0, prefix_len2=0, width=72):`
			`wrapping_after = [ (0,), ]`

			`w1 = width - prefix_len1`
			`w2 = width - prefix_len2`

			`for i in range(1, len(words)+1):`
			`best_so_far = None`
			`best_score = 1e300`
			`for j in range(i):`
			`t = wrapping_after[j]`
			`t1 = t[:-1] + (i,)`
			`t2 = t + (i,)`
			`wq1 = wrapping_quality(words, t1, w1, w2)`
			`wq2 = wrapping_quality(words, t2, w1, w2)`

			`if wq1 < best_score:`
			`best_so_far = t1`
			`best_score = wq1`
			`if wq2 < best_score:`
			`best_so_far = t2`
			`best_score = wq2`
			`wrapping_after.append( best_so_far )`

			`lines = generate_wrapping(words, wrapping_after[-1])`

			`return lines`

			`def hyphenateable(word):`
			`if re.match(r'^[^\d\-].*-', word):`
			`stripped = re.sub(r'^\W+','',word)`
			`stripped = re.sub(r'\W+$','',word)`
			`return stripped not in NO_HYPHENATE`
			`else:`
			`return False`

			`def split_paragraph(s):`
			`"Split paragraph into words; tuned for Tor."`

			`r = []`
			`for word in s.split():`
			`if hyphenateable(word):`
			`while "-" in word:`
			`a,word = word.split("-",1)`
			`r.append(a+"\xff")`
			`r.append(word)`
			`return r`

			`def fill(text, width, initial_indent, subsequent_indent):`
			`words = split_paragraph(text)`
			`lines = wrap_graf(words, len(initial_indent), len(subsequent_indent),`
			`width)`
			`res = [ initial_indent, lines[0], "\n" ]`
			`for line in lines[1:]:`
			`res.append(subsequent_indent)`
			`res.append(line)`
			`res.append("\n")`
			`return "".join(res)`

			`# ==============================`

Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00
			`TP_MAINHEAD = 0`
			`TP_HEADTEXT = 1`
			`TP_BLANK = 2`
			`TP_SECHEAD = 3`
			`TP_ITEMFIRST = 4`
			`TP_ITEMBODY = 5`
format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00			`TP_END = 6`
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00
			`def head_parser(line):`
			`if re.match(r'^[A-Z]', line):`
			`return TP_MAINHEAD`
			`elif re.match(r'^ o ', line):`
			`return TP_SECHEAD`
			`elif re.match(r'^\s*$', line):`
			`return TP_BLANK`
			`else:`
			`return TP_HEADTEXT`

			`def body_parser(line):`
			`if re.match(r'^ o ', line):`
			`return TP_SECHEAD`
			`elif re.match(r'^ -',line):`
			`return TP_ITEMFIRST`
			`elif re.match(r'^ \S', line):`
			`return TP_ITEMBODY`
			`elif re.match(r'^\s*$', line):`
			`return TP_BLANK`
format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00			`elif re.match(r'^Changes in', line):`
			`return TP_END`
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00			`else:`
			`print "Weird line %r"%line`

			`class ChangeLog(object):`
			`def __init__(self):`
			`self.mainhead = None`
			`self.headtext = []`
			`self.curgraf = None`
			`self.sections = []`
			`self.cursection = None`
			`self.lineno = 0`

			`def addLine(self, tp, line):`
			`self.lineno += 1`

			`if tp == TP_MAINHEAD:`
			`assert not self.mainhead`
			`self.mainhead = line`

			`elif tp == TP_HEADTEXT:`
			`if self.curgraf is None:`
			`self.curgraf = []`
			`self.headtext.append(self.curgraf)`
			`self.curgraf.append(line)`

			`elif tp == TP_BLANK:`
			`self.curgraf = None`

			`elif tp == TP_SECHEAD:`
			`self.cursection = [ self.lineno, line, [] ]`
			`self.sections.append(self.cursection)`

			`elif tp == TP_ITEMFIRST:`
			`item = ( self.lineno, [ [line] ])`
			`self.curgraf = item[1][0]`
			`self.cursection[2].append(item)`

			`elif tp == TP_ITEMBODY:`
			`if self.curgraf is None:`
			`self.curgraf = []`
			`self.cursection[2][1][-1].append(self.curgraf)`
			`self.curgraf.append(line)`

			`else:`
			`assert "This" is "unreachable"`

			`def lint_head(self, line, head):`
			`m = re.match(r'^ *o ([^\(]+)((?:\([^\)]+\))?):', head)`
			`if not m:`
			`print >>sys.stderr, "Weird header format on line %s"%line`

			`def lint_item(self, line, grafs, head_type):`
			`pass`

			`def lint(self):`
			`self.head_lines = {}`
			`for sec_line, sec_head, items in self.sections:`
			`head_type = self.lint_head(sec_line, sec_head)`
			`for item_line, grafs in items:`
			`self.lint_item(item_line, grafs, head_type)`

			`def dumpGraf(self,par,indent1,indent2=-1):`
			`if indent2 == -1:`
			`indent2 = indent1`
			`text = " ".join(re.sub(r'\s+', ' ', line.strip()) for line in par)`
Tweak the changelog formatter a little. (I had a bad clone of Knuth's algorithm sitting around in an old code repository of mine. I added orphan detection and smarter hyphenation; it seems to give marginally better results than we had before.) 2014-05-02 18:50:23 +02:00
			`sys.stdout.write(fill(text,`
			`width=72,`
			`initial_indent=" "*indent1,`
			`subsequent_indent=" "*indent2))`
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00
			`def dump(self):`
			`print self.mainhead`
			`for par in self.headtext:`
			`self.dumpGraf(par, 2)`
			`print`
			`for _,head,items in self.sections:`
			`if not head.endswith(':'):`
			`print >>sys.stderr, "adding : to %r"%head`
			`head = head + ":"`
			`print head`
			`for _,grafs in items:`
			`self.dumpGraf(grafs[0],4,6)`
			`for par in grafs[1:]:`
			`print`
			`self.dumpGraf(par,6,6)`
			`print`
			`print`

			`CL = ChangeLog()`
			`parser = head_parser`

format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00			`sys.stdin = open('ChangeLog', 'r')`

Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00			`for line in sys.stdin:`
			`line = line.rstrip()`
			`tp = parser(line)`

			`if tp == TP_SECHEAD:`
			`parser = body_parser`
format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00			`elif tp == TP_END:`
			`nextline = line`
			`break`

			`CL.addLine(tp,line)`
Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00
			`CL.lint()`
format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00
			`sys.stdout = open('ChangeLog.new', 'w')`

Reformat the changelog for 0.2.5.4-alpha. No textual changes. Also, add a script to do this, since doing it manually with fmt sucks. 2014-04-24 19:44:24 +02:00			`CL.dump()`
format_changelog.py now formats the first section, in-place. 2014-04-25 08:43:19 +02:00
			`print nextline`

			`for line in sys.stdin:`
			`sys.stdout.write(line)`

			`os.rename('ChangeLog.new', 'ChangeLog')`