tor/scripts/maint/practracker/includes.py

#!/usr/bin/env python
# Copyright 2018 The Tor Project, Inc.  See LICENSE file for licensing info.

"""This script looks through all the directories for files matching *.c or
   *.h, and checks their #include directives to make sure that only "permitted"
   headers are included.

   Any #include directives with angle brackets (like #include <stdio.h>) are
   ignored -- only directives with quotes (like #include "foo.h") are
   considered.

   To decide what includes are permitted, this script looks at a .may_include
   file in each directory.  This file contains empty lines, #-prefixed
   comments, filenames (like "lib/foo/bar.h") and file globs (like lib/*/*.h)
   for files that are permitted.

   The script exits with an error if any non-permitted includes are found.
   .may_include files that contain "!advisory" are considered advisory.
   Advisory .may_include files only result in warnings, rather than errors.
"""

# Future imports for Python 2.7, mandatory in 3.0
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import fnmatch
import os
import re
import sys

if sys.version_info[0] <= 2:
    def open_file(fname):
        return open(fname, 'r')
else:
    def open_file(fname):
        return open(fname, 'r', encoding='utf-8')

def warn(msg):
    print(msg, file=sys.stderr)

def fname_is_c(fname):
    """
    Return true if 'fname' is the name of a file that we should
    search for possibly disallowed #include directives.
    """
    if fname.endswith((".c", ".h")):
        bname = os.path.basename(fname)
        return not bname.startswith((".", "#"))
    else:
        return False

INCLUDE_PATTERN = re.compile(r'\s*#\s*include\s+"([^"]*)"')
RULES_FNAME = ".may_include"

ALLOWED_PATTERNS = [
    re.compile(r'^.*\*\.(h|inc)$'),
    re.compile(r'^.*/.*\.h$'),
    re.compile(r'^ext/.*\.c$'),
    re.compile(r'^orconfig.h$'),
    re.compile(r'^micro-revision.i$'),
]

TOPDIR = "src"

def pattern_is_normal(s):
    for p in ALLOWED_PATTERNS:
        if p.match(s):
            return True
    return False

class Error(object):
    def __init__(self, location, msg, is_advisory=False):
        self.location = location
        self.msg = msg
        self.is_advisory = is_advisory

    def __str__(self):
        return "{} at {}".format(self.msg, self.location)

class Rules(object):
    """ A 'Rules' object is the parsed version of a .may_include file. """
    def __init__(self, dirpath):
        self.dirpath = dirpath
        if dirpath.startswith("src/"):
            self.incpath = dirpath[4:]
        else:
            self.incpath = dirpath
        self.patterns = []
        self.usedPatterns = set()
        self.is_advisory = False

    def addPattern(self, pattern):
        if pattern == "!advisory":
            self.is_advisory = True
            return
        if not pattern_is_normal(pattern):
            warn("Unusual pattern {} in {}".format(pattern, self.dirpath))
        self.patterns.append(pattern)

    def includeOk(self, path):
        for pattern in self.patterns:
            if fnmatch.fnmatchcase(path, pattern):
                self.usedPatterns.add(pattern)
                return True
        return False

    def applyToLines(self, lines, loc_prefix=""):
        lineno = 0
        for line in lines:
            lineno += 1
            m = INCLUDE_PATTERN.match(line)
            if m:
                include = m.group(1)
                if not self.includeOk(include):
                    yield Error("{}{}".format(loc_prefix,str(lineno)),
                                "Forbidden include of {}".format(include),
                                is_advisory=self.is_advisory)

    def applyToFile(self, fname, f):
        for error in self.applyToLines(iter(f), "{}:".format(fname)):
            yield error

    def noteUnusedRules(self):
        for p in self.patterns:
            if p not in self.usedPatterns:
                warn("Pattern {} in {} was never used.".format(p, self.dirpath))

    def getAllowedDirectories(self):
        allowed = []
        for p in self.patterns:
            m = re.match(r'^(.*)/\*\.(h|inc)$', p)
            if m:
                allowed.append(m.group(1))
                continue
            m = re.match(r'^(.*)/[^/]*$', p)
            if m:
                allowed.append(m.group(1))
                continue

        return allowed


def normalize_srcdir(fname):
    """given the name of a source directory or file, return its name
        relative to `src` in a unix-like format.
    """
    orig = fname
    dirname, dirfile = os.path.split(fname)
    if re.match(r'.*\.[ch]$', dirfile):
        fname = dirname

    # Now we have a directory.
    dirname, result = os.path.split(fname)
    for _ in range(100):
        # prevent excess looping in case I missed a tricky case
        dirname, dirpart = os.path.split(dirname)
        if dirpart == 'src' or dirname == "":
            #print(orig,"=>",result)
            return result
        result = "{}/{}".format(dirpart,result)

    print("No progress!")
    assert False

include_rules_cache = {}

def load_include_rules(fname):
    """ Read a rules file from 'fname', and return it as a Rules object.
        Return 'None' if fname does not exist.
    """
    if fname in include_rules_cache:
        return include_rules_cache[fname]
    if not os.path.exists(fname):
        include_rules_cache[fname] = None
        return None
    result = Rules(os.path.split(fname)[0])
    with open_file(fname) as f:
        for line in f:
            line = line.strip()
            if line.startswith("#") or not line:
                continue
            result.addPattern(line)
    include_rules_cache[fname] = result
    return result

def get_all_include_rules():
    """Return a list of all the Rules objects we have loaded so far,
       sorted by their directory names."""
    return [ rules for (fname,rules) in
             sorted(include_rules_cache.items())
             if rules is not None ]

def remove_self_edges(graph):
    """Takes a directed graph in as an adjacency mapping (a mapping from
       node to a list of the nodes to which it connects).

       Remove all edges from a node to itself."""

    for k in list(graph):
        graph[k] = [ d for d in graph[k] if d != k ]

def closure(graph):
    """Takes a directed graph in as an adjacency mapping (a mapping from
       node to a list of the nodes to which it connects), and completes
       its closure.
    """
    graph = graph.copy()
    changed = False
    for k in graph.keys():
        graph[k] = set(graph[k])
    while True:
        for k in graph.keys():
            sz = len(graph[k])
            for v in list(graph[k]):
                graph[k].update(graph.get(v, []))
            if sz != len(graph[k]):
                changed = True

        if not changed:
            return graph
        changed = False

def toposort(graph, limit=100):
    """Takes a directed graph in as an adjacency mapping (a mapping from
       node to a list of the nodes to which it connects).  Tries to
       perform a topological sort on the graph, arranging the nodes into
       "levels", such that every member of each level is only reachable
       by members of later levels.

       Returns a list of the members of each level.

       Modifies the input graph, removing every member that could be
       sorted.  If the graph does not become empty, then it contains a
       cycle.

       "limit" is the max depth of the graph after which we give up trying
       to sort it and conclude we have a cycle.
    """
    all_levels = []

    n = 0
    while graph:
        n += 0
        cur_level = []
        all_levels.append(cur_level)
        for k in list(graph):
            graph[k] = [ d for d in graph[k] if d in graph ]
            if graph[k] == []:
                cur_level.append(k)
        for k in cur_level:
            del graph[k]
        n += 1
        if n > limit:
            break

    return all_levels

def consider_include_rules(fname, f):
    dirpath = os.path.split(fname)[0]
    rules_fname = os.path.join(dirpath, RULES_FNAME)
    rules = load_include_rules(os.path.join(dirpath, RULES_FNAME))
    if rules is None:
        return

    for err in rules.applyToFile(fname, f):
        yield err

    list_unused = False
    log_sorted_levels = False

def walk_c_files(topdir="src"):
    """Run through all .c and .h files under topdir, looking for
       include-rule violations. Yield those violations."""

    for dirpath, dirnames, fnames in os.walk(topdir):
        for fname in fnames:
            if fname_is_c(fname):
                fullpath = os.path.join(dirpath,fname)
                with open(fullpath) as f:
                    for err in consider_include_rules(fullpath, f):
                        yield err

def open_or_stdin(fname):
    if fname == '-':
        return sys.stdin
    else:
        return open(fname)

def check_subsys_file(fname, uses_dirs):
    if not uses_dirs:
        # We're doing a distcheck build, or for some other reason there are
        # no .may_include files.
        print("SKIPPING")
        return False

    uses_dirs = { normalize_srcdir(k) : { normalize_srcdir(d) for d in v }
                  for (k,v) in uses_dirs.items() }
    uses_closure = closure(uses_dirs)
    ok = True
    previous_subsystems = []

    with open_or_stdin(fname) as f:
        for line in f:
            _, name, fname = line.split()
            fname = normalize_srcdir(fname)
            for prev in previous_subsystems:
                if fname in uses_closure[prev]:
                    print("INVERSION: {} uses {}".format(prev,fname))
                    ok = False
            previous_subsystems.append(fname)
    return not ok

def run_check_includes(topdir, list_unused=False, log_sorted_levels=False,
                       list_advisories=False, check_subsystem_order=None):
    trouble = False

    for err in walk_c_files(topdir):
        if err.is_advisory and not list_advisories:
            continue
        print(err, file=sys.stderr)
        if not err.is_advisory:
            trouble = True

    if trouble:
        warn(
    """To change which includes are allowed in a C file, edit the {}
    files in its enclosing directory.""".format(RULES_FNAME))
        sys.exit(1)

    if list_unused:
        for rules in get_all_include_rules():
            rules.noteUnusedRules()

    uses_dirs = { }
    for rules in get_all_include_rules():
        uses_dirs[rules.incpath] = rules.getAllowedDirectories()

    remove_self_edges(uses_dirs)

    if check_subsystem_order:
        if check_subsys_file(check_subsystem_order, uses_dirs):
            sys.exit(1)

    all_levels = toposort(uses_dirs)

    if log_sorted_levels:
        for (n, cur_level) in enumerate(all_levels):
            if cur_level:
                print(n, cur_level)

    if uses_dirs:
        print("There are circular .may_include dependencies in here somewhere:",
              uses_dirs)
        sys.exit(1)

def main(argv):
    import argparse

    progname = argv[0]
    parser = argparse.ArgumentParser(prog=progname)
    parser.add_argument("--toposort", action="store_true",
                        help="Print a topologically sorted list of modules")
    parser.add_argument("--list-unused", action="store_true",
                        help="List unused lines in .may_include files.")
    parser.add_argument("--list-advisories", action="store_true",
                        help="List advisories as well as forbidden includes")
    parser.add_argument("--check-subsystem-order", action="store",
                        help="Check a list of subsystems for ordering")
    parser.add_argument("topdir", default="src", nargs="?",
                        help="Top-level directory for the tor source")
    args = parser.parse_args(argv[1:])

    global TOPDIR
    TOPDIR = args.topdir
    run_check_includes(topdir=args.topdir,
                       log_sorted_levels=args.toposort,
                       list_unused=args.list_unused,
                       list_advisories=args.list_advisories,
                       check_subsystem_order=args.check_subsystem_order)

if __name__ == '__main__':
    main(sys.argv)