From 61fb65ce4c4a37d7a4542a3d2dd38647dd263c71 Mon Sep 17 00:00:00 2001 From: Diego Wilson Date: Thu, 7 Sep 2017 17:44:35 -0400 Subject: [PATCH] Add repo_diff tools These tools help compare android repo workspaces for high level analysis. Test: Run python repo_diff/repo_diff_android.py Change-Id: I645351521c7f61735d3ce65a93569983cd28851c --- tools/repo_diff/exclusions.txt | 1 + tools/repo_diff/git_commits_not_upstreamed.py | 150 ++++++ tools/repo_diff/repo_diff_android.py | 166 ++++++ tools/repo_diff/repo_diff_downstream.py | 491 ++++++++++++++++++ 4 files changed, 808 insertions(+) create mode 100644 tools/repo_diff/exclusions.txt create mode 100644 tools/repo_diff/git_commits_not_upstreamed.py create mode 100644 tools/repo_diff/repo_diff_android.py create mode 100644 tools/repo_diff/repo_diff_downstream.py diff --git a/tools/repo_diff/exclusions.txt b/tools/repo_diff/exclusions.txt new file mode 100644 index 000000000..965a3f5fa --- /dev/null +++ b/tools/repo_diff/exclusions.txt @@ -0,0 +1 @@ +platform/prebuilts/.* diff --git a/tools/repo_diff/git_commits_not_upstreamed.py b/tools/repo_diff/git_commits_not_upstreamed.py new file mode 100644 index 000000000..30e798b8a --- /dev/null +++ b/tools/repo_diff/git_commits_not_upstreamed.py @@ -0,0 +1,150 @@ +"""List downstream commits that are not upstream and are visible in the diff. + +Only include changes that are visible when you diff +the downstream and upstream branches. + +This will naturally exclude changes that already landed upstream +in some form but were not merged or cherry picked. + +This will also exclude changes that were added then reverted downstream. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import argparse +import os +import subprocess + + +def git(args): + """Git command. + + Args: + args: A list of arguments to be sent to the git command. + + Returns: + The output of the git command. 
+ """ + + command = ['git'] + command.extend(args) + with open(os.devnull, 'w') as devull: + return subprocess.check_output(command, stderr=devull) + + +class CommitFinder(object): + + def __init__(self, working_dir, upstream, downstream): + self.working_dir = working_dir + self.upstream = upstream + self.downstream = downstream + + def __call__(self, filename): + insertion_commits = set() + + if os.path.isfile(os.path.join(self.working_dir, filename)): + blame_output = git(['-C', self.working_dir, 'blame', '-l', + '%s..%s' % (self.upstream, self.downstream), + '--', filename]) + for line in blame_output.splitlines(): + # The commit is the first field of a line + blame_fields = line.split(' ', 1) + # Some lines can be empty + if blame_fields: + insertion_commits.add(blame_fields[0]) + + return insertion_commits + + +def find_insertion_commits(upstream, downstream, working_dir): + """Finds all commits that insert lines on top of the upstream baseline. + + Args: + upstream: Upstream branch to be used as a baseline. + downstream: Downstream branch to search for commits missing upstream. + working_dir: Run as if git was started in this directory. + + Returns: + A set of commits that insert lines on top of the upstream baseline. + """ + + insertion_commits = set() + + diff_files = git(['-C', working_dir, 'diff', + '--name-only', + '--diff-filter=d', + upstream, + downstream]) + diff_files = diff_files.splitlines() + + finder = CommitFinder(working_dir, upstream, downstream) + commits_per_file = [finder(filename) for filename in diff_files] + + for commits in commits_per_file: + insertion_commits.update(commits) + + return insertion_commits + + +def find(upstream, downstream, working_dir): + """Finds downstream commits that are not upstream and are visible in the diff. + + Args: + upstream: Upstream branch to be used as a baseline. + downstream: Downstream branch to search for commits missing upstream. + working_dir: Run as if git was started in this directory. 
+ + Returns: + A set of downstream commits missing upstream. + """ + + commits_not_upstreamed = set() + revlist_output = git(['-C', working_dir, 'rev-list', '--no-merges', + '%s..%s' % (upstream, downstream)]) + downstream_only_commits = set(revlist_output.splitlines()) + insertion_commits = set() + + # If there are no downstream-only commits there's no point in + # further filtering + if downstream_only_commits: + insertion_commits = find_insertion_commits(upstream, downstream, + working_dir) + + # The commits that are only downstream and are visible in 'git blame' are the + # ones that insert lines in the diff between upstream and downstream. + commits_not_upstreamed.update( + downstream_only_commits.intersection(insertion_commits)) + + # TODO(diegowilson) add commits that deleted lines + + return commits_not_upstreamed + + +def main(): + parser = argparse.ArgumentParser( + description='Finds commits yet to be applied upstream.') + parser.add_argument( + 'upstream', + help='Upstream branch to be used as a baseline.', + ) + parser.add_argument( + 'downstream', + help='Downstream branch to search for commits missing upstream.', + ) + parser.add_argument( + '-C', + '--working_directory', + help='Run as if git was started in thid directory', + default='.',) + args = parser.parse_args() + upstream = args.upstream + downstream = args.downstream + working_dir = os.path.abspath(args.working_directory) + + print('\n'.join(find(upstream, downstream, working_dir))) + + +if __name__ == '__main__': + main() diff --git a/tools/repo_diff/repo_diff_android.py b/tools/repo_diff/repo_diff_android.py new file mode 100644 index 000000000..b3039da7c --- /dev/null +++ b/tools/repo_diff/repo_diff_android.py @@ -0,0 +1,166 @@ +#!/usr/bin/python +"""Diff a repo (downstream) and its upstream. + +This script: + 1. Downloads a repo source tree with specified manifest URL, branch + and release tag. + 2. Retrieves the BUILD_ID from $downstream/build/core/build_id.mk. + 3. 
Downloads the upstream using the BUILD_ID. + 4. Diffs each project in these two repos. +""" + +import argparse +import os +import subprocess +import repo_diff_downstream + +HELP_MSG = "Diff a repo (downstream) and its upstream" + +WORKSPACE = "workspace" +UPSTREAM_WORKSPACE = "upstream_workspace" + +DEFAULT_MANIFEST_URL = "https://android.googlesource.com/platform/manifest" +DEFAULT_MANIFEST_BRANCH = "oreo-dev" +DEFAULT_UPSTREAM_MANIFEST_URL = "https://android.googlesource.com/platform/manifest" +DEFAULT_UPSTREAM_MANIFEST_BRANCH = "master" +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_EXCLUSIONS_FILE = os.path.join(SCRIPT_DIR, "exclusions.txt") + + +def parse_args(): + """Parse args.""" + + parser = argparse.ArgumentParser(description=HELP_MSG) + + parser.add_argument("-u", "--manifest-url", + help="manifest url", + default=DEFAULT_MANIFEST_URL) + parser.add_argument("-b", "--manifest-branch", + help="manifest branch", + default=DEFAULT_MANIFEST_BRANCH) + parser.add_argument("-r", "--upstream-manifest-url", + help="upstream manifest url", + default=DEFAULT_UPSTREAM_MANIFEST_URL) + parser.add_argument("-a", "--upstream-manifest-branch", + help="upstream manifest branch", + default=DEFAULT_UPSTREAM_MANIFEST_BRANCH) + parser.add_argument("-e", "--exclusions-file", + help="exclusions file", + default=DEFAULT_EXCLUSIONS_FILE) + parser.add_argument("-t", "--tag", + help="release tag (optional). If not set then will" + "sync the latest in the branch.") + + return parser.parse_args() + + +def repo_init(url, rev, workspace): + """Repo init with specific url and rev. 
+ + Args: + url: manifest url + rev: manifest branch, or rev + workspace: the folder to init and sync code + """ + + print("repo init:\n url: %s\n rev: %s\n workspace: %s" % + (url, rev, workspace)) + + subprocess.check_output("repo init --manifest-url=%s --manifest-branch=%s" % + (url, rev), cwd=workspace, shell=True) + + +def repo_sync(workspace, retry=5): + """Repo sync.""" + + count = 0 + while count < retry: + count += 1 + print("repo sync (retry=%d/%d):\n workspace: %s" % + (count, retry, workspace)) + + try: + subprocess.check_output(("repo sync --jobs=24 --current-branch --quiet " + "--no-tags --no-clone-bundle"), + cwd=workspace, shell=True) + except subprocess.CalledProcessError as e: + print "Error: %s" % e.output + # Stop retrying if the repo sync was successful + else: + break + + +def get_commit_with_keyword(project_path, keyword): + """Get the latest commit in $project_path with the specific keyword.""" + + return subprocess.check_output(("git -C %s " + "rev-list --max-count=1 --grep=\"%s\" " + "HEAD") % + (project_path, keyword), shell=True).rstrip() + + +def get_build_id(workspace): + """Get BUILD_ID defined in $workspace/build/core/build_id.mk.""" + + path = os.path.join(workspace, "build", "core", "build_id.mk") + return subprocess.check_output("source %s && echo $BUILD_ID" % path, + shell=True).rstrip() + + +def repo_sync_specific_release(url, branch, tag, workspace): + """Repo sync source with the specific release tag.""" + + if not os.path.exists(workspace): + os.makedirs(workspace) + + manifest_path = os.path.join(workspace, ".repo", "manifests") + + repo_init(url, branch, workspace) + if tag: + rev = get_commit_with_keyword(manifest_path, tag) + repo_init(url, rev, workspace) + repo_sync(workspace) + + +def diff(manifest_url, manifest_branch, tag, upstream_manifest_url, + upstream_manifest_branch, exclusions_file): + """Syncs and diffs an Android workspace against an upstream workspace.""" + + workspace = os.path.abspath(WORKSPACE) + 
upstream_workspace = os.path.abspath(UPSTREAM_WORKSPACE) + # repo sync downstream source tree + repo_sync_specific_release( + manifest_url, + manifest_branch, + tag, + workspace) + + # get the build_id so that we know which rev of upstream we need + build_id = get_build_id(workspace) + + # repo sync upstream source tree + repo_sync_specific_release( + upstream_manifest_url, + upstream_manifest_branch, + build_id, + upstream_workspace) + + # do the comparison + repo_diff_downstream.diff( + upstream_workspace, + workspace, + os.path.abspath("project.csv"), + os.path.abspath("commit.csv"), + os.path.abspath(exclusions_file), + ) + + +def main(): + args = parse_args() + + diff(args.manifest_url, args.manifest_branch, args.tag, + args.upstream_manifest_url, args.upstream_manifest_branch, + args.exclusions_file) + +if __name__ == "__main__": + main() diff --git a/tools/repo_diff/repo_diff_downstream.py b/tools/repo_diff/repo_diff_downstream.py new file mode 100644 index 000000000..d6e3691e2 --- /dev/null +++ b/tools/repo_diff/repo_diff_downstream.py @@ -0,0 +1,491 @@ +"""Diffs one repo source tree against an upstream repo source tree. + +Matches the projects from a Gerrit repo workspace to the projects +of an upstream workspace. After identifying projects that exist both in the +downstream and the upstream workspace it then diffs each project. + +Finally, the results of the project matching and diffing are reported. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import argparse +import csv +import datetime +import multiprocessing +import os +import re +import subprocess +import xml.etree.ElementTree as et +import git_commits_not_upstreamed + + +def get_projects(source_tree): + """Retrieve the dict of project names and paths. + + Args: + source_tree: A path to the source tree. + + Returns: + A dict of project paths keyed by project names. 
+ """ + + projects = {} + + manifest = source_tree + '/.repo/manifest.xml' + tree = et.parse(manifest) + root = tree.getroot() + + for project in root.findall('project'): + # Ignore projects that are not synced by default + if 'notdefault' in project.get('groups', ''): + continue + path = project.get('path', project.get('name')) + path = os.path.abspath(os.path.join(source_tree, path)) + name = project.get('name') + projects[name] = path + + return projects + + +def git(args): + """Git command. + + Args: + args: A list of arguments to be sent to the git command. + + Returns: + The output of the git command. + """ + + command = ['git'] + command.extend(args) + with open(os.devnull, 'w') as devull: + return subprocess.check_output(command, stderr=devull) + + +def get_revision_diff_stats(directory, rev_a, rev_b): + """Retrieves stats of diff between two git revisions. + + Args: + directory: A path to the git directory to diff. + rev_a: A git revision to diff. + rev_b: A git revision to diff. + + Returns: + A dict with the count of files modified, lines added + and lines removed. + """ + stats = { + 'file': 0, + 'insertion': 0, + 'deletion': 0, + } + + git_diffstat = git( + ['-C', directory, 'diff', '--shortstat', rev_a, rev_b]) + for element in git_diffstat.split(','): + for key in stats: + if key in element: + stats[key] = int(element.split()[0]) + + return stats + + +def get_project_stats(upstream_dir, downstream_dir): + """Retrieves stats of diff between two git projects. + + Diffs a downstream directory against an upstream directory. + Lines that exist only in the downstream directory are considered insertions. + Lines that exist only in the upstream directory are considered deletions. + + Args: + upstream_dir: A path to the upstream directory to compare. + downstream_dir: A path to the downstream directory to compare. + + Returns: + A dict with the count of files modified, lines added + and lines removed. 
+ """ + stats = { + 'file': 0, + 'insertion': 0, + 'deletion': 0, + } + + if upstream_dir and downstream_dir: + print('Diffing %s vs %s' % (downstream_dir, upstream_dir)) + git(['-C', downstream_dir, 'fetch', '--update-shallow', upstream_dir]) + stats = get_revision_diff_stats(downstream_dir, 'FETCH_HEAD', 'HEAD') + + return stats + + +def match_project_by_root_commits( + downstream_project_name, downstream_project_path, upstream_root_commits): + """Match a downstream project to an upstream project using their root commits. + + Find all root commits in a downstream project and find a matching + upstream project that has a root commit in common. + + Args: + downstream_project_name: A string with the downstream project name. + downstream_project_path: A string with the downstream project path. + upstream_root_commits: A dict of root commits and their upstream projects. + + Returns: + A string with the matched upstream project name, or None if no match. + """ + upstream_match = None + downstream_root_commits = find_root_commits_in_path(downstream_project_path) + for root in downstream_root_commits: + if root in upstream_root_commits: + upstream_project_list = upstream_root_commits[root] + if len(upstream_project_list) > 1: + print('Warning: ' + downstream_project_name + + ' matches multiple projects') + print(upstream_project_list) + else: + upstream_match = upstream_project_list[0]['name'] + # Once there's a root commit match, stop looking for a project match + break + + return upstream_match + + +def match_projects(upstream_projects, downstream_projects): + """Match downstream projects to upstream projects. + + Args: + upstream_projects: A dict of upstream projects. + downstream_projects: A dict of downstream projects. + + Returns: + A list of upstream and downstream project pairs. 
+ """ + + project_matches = [] + + # keep a set of upstream projects that have not been matched + unmatched_upstream_projects = set(upstream_projects.keys()) + + upstream_root_commits = find_root_commits_in_projects(upstream_projects) + # Match all downstream projects to an upstream project + for downstream_name, downstream_path in downstream_projects.iteritems(): + # First try to match projects by name + if downstream_name in upstream_projects: + upstream_match = downstream_name + # If there is no project name match then try matching by commit + else: + upstream_match = match_project_by_root_commits( + downstream_name, downstream_path, upstream_root_commits) + + project_matches.append({ + 'upstream': upstream_match, + 'downstream': downstream_name, + }) + unmatched_upstream_projects.discard(upstream_match) + + # Add all upstream projects that have not been matched + for project in unmatched_upstream_projects: + project_matches.append({ + 'upstream': project, + 'downstream': None, + }) + + return project_matches + + +def filter_exclusion_list(projects, exclusion_file): + """Removes all projects that match the exclusion patterns.""" + + filtered = {} + + exclusion_list = [] + if exclusion_file: + with open(exclusion_file) as f: + exclusion_list = f.readlines() + exclusion_list = [line.strip() for line in exclusion_list] + exclusion_pattern = '|'.join(exclusion_list) + + if exclusion_pattern: + for name, path in projects.iteritems(): + if re.match(exclusion_pattern, name): + print('Excluding ' + name) + else: + filtered[name] = path + else: + filtered = projects + + return filtered + + +def get_all_projects_stats(upstream_source_tree, downstream_source_tree, + exclusion_file): + """Finds the stats of all projects in a source tree. + + Args: + upstream_source_tree: A string with the path to the upstream gerrit + source tree. + downstream_source_tree: A string with the path to the downstream gerrit + source tree. 
+ exclusion_file: A string with the path to the exclusion file. + + Returns: + A list of dicts of matching upstream and downstream projects + including stats for projects that match. + """ + all_project_stats = [] + + upstream_projects = get_projects(upstream_source_tree) + downstream_projects = get_projects(downstream_source_tree) + + upstream_projects = filter_exclusion_list(upstream_projects, exclusion_file) + downstream_projects = filter_exclusion_list(downstream_projects, + exclusion_file) + + project_matches = match_projects(upstream_projects, downstream_projects) + + for match in project_matches: + upstream_project_name = match['upstream'] + downstream_project_name = match['downstream'] + project_stats = get_project_stats( + upstream_projects.get(upstream_project_name, None), + downstream_projects.get(downstream_project_name, None)) + status = '' + if not upstream_project_name: + status = 'Downstream Only Projects' + elif not downstream_project_name: + status = 'Upstream Only Projects' + elif project_stats['file'] == 0: + status = 'Intact Projects' + elif upstream_project_name == downstream_project_name: + status = 'Modified Projects' + else: + status = 'Forked Projects' + + project_stats['status'] = status + project_stats['upstream'] = upstream_project_name + project_stats['downstream'] = downstream_project_name + project_stats['downstream_path'] = downstream_projects.get( + downstream_project_name) + + all_project_stats.append(project_stats) + + return all_project_stats + + +def find_root_commits_in_path(path): + """Returns a list of root commits in a git project path.""" + print('Analyzing history of ' + path) + rev_list = git(['-C', path, 'rev-list', '--max-parents=0', 'HEAD']) + return rev_list.splitlines() + + +def find_root_commits_in_projects(projects): + """Returns a dict of root commits with all projects with that root commit.""" + root_commits = {} + for name, path in projects.iteritems(): + for root in find_root_commits_in_path(path): + root_list = 
root_commits.get(root, []) + root_list.append({ + 'name': name, + 'path': path, + }) + root_commits[root] = root_list + return root_commits + + +def get_commit_stats_in_project(project): + """Extract commits that have not been upstreamed in a specific project. + + Args: + project: A dict of a project name and path. + + Returns: + A dict of commits not upstreamed. + """ + name = project['name'] + path = project['downstream_path'] + print('Finding commits not upstreamed in ' + name) + commits = git_commits_not_upstreamed.find('FETCH_HEAD', 'HEAD', path) + print('Found commits not upstreamed in ' + name) + stats = [] + for commit in commits: + author = git(['-C', path, 'show', '--no-patch', '--format=%ae', commit]) + author = author.strip() + subject = git(['-C', path, 'show', '--no-patch', '--format=%s', commit]) + subject = subject.strip() + stats.append({ + 'commit': commit, + 'author': author, + 'subject': subject, + }) + + return { + 'name': name, + 'stats': stats, + } + + +def get_all_commits_stats(project_stats): + """Extract commits that have not been upstreamed in all projects. + + Args: + project_stats: A list of dicts of matching upstream and downstream projects + including stats for projects that match. + + Returns: + A dict of commits not upstreamed, keyed by downstream project name. + """ + commit_stats = {} + downstream_stats = {match['downstream']: match for match in project_stats} + + # Only analyze modified projects + modified_projects = [] + for name, stats in downstream_stats.iteritems(): + if stats['status'].startswith('Modified'): + stats['name'] = name + modified_projects.append(stats) + + pool = multiprocessing.Pool() + + commit_stats = pool.map(get_commit_stats_in_project, modified_projects) + + commit_stats = {stats['name']: stats['stats'] for stats in commit_stats} + + return commit_stats + + +def write_commit_csv(commit_stats, commit_output_file): + """Write not-upstreamed commit data to a CSV file. + + Args: + commit_stats: The dict of the stats for all commits. 
+ commit_output_file: Path to the output file. + """ + with open(commit_output_file, 'w') as f: + fieldnames = [ + 'Date', + 'Commit', + 'Downstream Project', + 'Author', + 'Subject', + ] + today = datetime.datetime.today().strftime('%Y/%m/%d') + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for project, stats in commit_stats.iteritems(): + for stat in stats: + writer.writerow({ + 'Date': today, + 'Commit': stat['commit'], + 'Downstream Project': project, + 'Author': stat['author'], + 'Subject': stat['subject'], + }) + print('Wrote commit stats to ' + commit_output_file) + + +def write_project_csv(project_stats, commit_stats, project_output_file): + """Write project comparison data to a CSV file. + + Args: + project_stats: The dict of the stats for all projects. + commit_stats: The dict of the stats for all commits. + project_output_file: Path to the output file. + """ + with open(project_output_file, 'w') as f: + fieldnames = [ + 'Date', + 'Downstream Project', + 'Upstream Project', + 'Diff Status', + 'Files Changed', + 'Line Insertions', + 'Line Deletions', + 'Line Changes', + 'Commits Not Upstreamed', + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + today = datetime.datetime.today().strftime('%Y/%m/%d') + for stat in project_stats: + commits_not_upstreamed = 0 + downstream_project = stat['downstream'] + if downstream_project in commit_stats: + commits_not_upstreamed = len(commit_stats[downstream_project]) + writer.writerow({ + 'Date': today, + 'Downstream Project': downstream_project, + 'Upstream Project': stat['upstream'], + 'Diff Status': stat['status'], + 'Files Changed': stat['file'], + 'Line Insertions': stat['insertion'], + 'Line Deletions': stat['deletion'], + 'Line Changes': stat['insertion'] + stat['deletion'], + 'Commits Not Upstreamed': commits_not_upstreamed, + }) + print('Wrote project stats to ' + project_output_file) + + +def diff(upstream_source_tree, downstream_source_tree, 
project_output_file, + commit_output_file, exclusions_file): + """Diff one repo source tree against another. + + Args: + upstream_source_tree: A string with the path to a gerrit source tree. + downstream_source_tree: A string with the path to a gerrit source tree. + project_output_file: Path to the project output file. + commit_output_file: Path to the commit output file. + exclusions_file: Path to exclusions file. + """ + project_stats = get_all_projects_stats(upstream_source_tree, + downstream_source_tree, + exclusions_file) + commit_stats = get_all_commits_stats(project_stats) + write_commit_csv(commit_stats, commit_output_file) + write_project_csv(project_stats, commit_stats, project_output_file) + + +def main(): + parser = argparse.ArgumentParser( + description='Diff a repo source tree against an upstream source tree.') + parser.add_argument('upstream_path', help='Path to an upstream source tree.') + parser.add_argument( + 'downstream_path', help='Path to a downstream source tree.') + parser.add_argument( + '-p', + '--project_output_file', + help='Path to write the project output file', + default='project.csv',) + parser.add_argument( + '-c', + '--commit_output_file', + help='Path to write the commit output file', + default='commit.csv',) + parser.add_argument( + '-e', + '--exclusions_file', + help='Path to file with a list of project names to be excluded from' + 'the diff. 
You may use a regular expression to match project names as' + 'described in https://docs.python.org/2/howto/regex.html', + default='', + ) + args = parser.parse_args() + upstream_source_tree = os.path.abspath(args.upstream_path) + downstream_source_tree = os.path.abspath(args.downstream_path) + project_output_file = os.path.abspath(args.project_output_file) + commit_output_file = os.path.abspath(args.commit_output_file) + exclusions_file = '' + if args.exclusions_file: + exclusions_file = os.path.abspath(args.exclusions_file) + + diff(upstream_source_tree, downstream_source_tree, project_output_file, + commit_output_file, exclusions_file) + + +if __name__ == '__main__': + main()