
sstate-cache-management: Rewrite in python

This (should be) a drop-in replacement for sstate-cache-management.sh.
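
Example invocations (paths are hypothetical; see the option help below):

  ./scripts/sstate-cache-management.py --cache-dir=/path/to/sstate-cache --remove-duplicated -y
  ./scripts/sstate-cache-management.py --cache-dir=/path/to/sstate-cache --stamps-dir=/path/to/build/tmp/stamps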

(From OE-Core rev: 2fa1b25d7485bfbb92bcc33067beb6751218b36a)

Signed-off-by: Alex Kiernan <alex.kiernan@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Alex Kiernan, 1 year ago
commit b723fcaac5

1 changed file with 329 additions and 0 deletions:
  scripts/sstate-cache-management.py

@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+#
+# Copyright OpenEmbedded Contributors
+#
+# SPDX-License-Identifier: MIT
+#
+
+import argparse
+import os
+import re
+import sys
+
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+
+if sys.version_info < (3, 8, 0):
+    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")
+
+SSTATE_PREFIX = "sstate:"
+SSTATE_EXTENSION = ".tar.zst"
+# SSTATE_EXTENSION = ".tgz"
+# .siginfo.done files are mentioned in the original script?
+SSTATE_SUFFIXES = (
+    SSTATE_EXTENSION,
+    f"{SSTATE_EXTENSION}.siginfo",
+    f"{SSTATE_EXTENSION}.done",
+)
+
+RE_SSTATE_PKGSPEC = re.compile(
+    rf"""sstate:(?P<pn>[^:]*):
+         (?P<package_target>[^:]*):
+         (?P<pv>[^:]*):
+         (?P<pr>[^:]*):
+         (?P<sstate_pkgarch>[^:]*):
+         (?P<sstate_version>[^_]*):
+         (?P<bb_unihash>[^_]*)_
+         (?P<bb_task>[^:]*)
+         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
+    re.X,
+)
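+# For illustration, a matching filename has this shape, where the
+# angle-bracketed fields correspond to the named groups above:
+#
+#   sstate:<pn>:<package_target>:<pv>:<pr>:<sstate_pkgarch>:<sstate_version>:<bb_unihash>_<bb_task><ext>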
+
+
+# Really we'd like something like a Path subclass which implements a stat
+# cache here, unfortunately there's no good way to do that transparently
+# (yet); see:
+#
+# https://github.com/python/cpython/issues/70219
+# https://discuss.python.org/t/make-pathlib-extensible/3428/77
+@dataclass
+class SstateEntry:
+    """Class for keeping track of an entry in sstate-cache."""
+
+    path: Path
+    match: re.Match
+    stat_result: os.stat_result = None
+
+    def __hash__(self):
+        return self.path.__hash__()
+
+    def __getattr__(self, name):
+        return self.match.group(name)
+
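+# Note: __getattr__ above forwards unknown attribute lookups to the regex
+# match, so e.g. entry.pn, entry.bb_task and entry.ext read the corresponding
+# named groups of RE_SSTATE_PKGSPEC.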
+
+# this is what's in the original script; as far as I can tell, it's an
+# implementation artefact which we don't need?
+def find_archs():
+    # all_archs
+    builder_arch = os.uname().machine
+
+    # FIXME
+    layer_paths = [Path("../..")]
+
+    tune_archs = set()
+    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
+    for path in layer_paths:
+        for tunefile in [
+            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
+        ]:
+            with open(tunefile) as f:
+                for line in f:
+                    m = re_tune.match(line)
+                    if m:
+                        tune_archs.update(m.group(1).split())
+
+    # all_machines
+    machine_archs = set()
+    for path in layer_paths:
+        for machine_file in path.glob("meta*/conf/machine/*.conf"):
+            machine_archs.add(machine_file.parts[-1][:-5])
+
+    extra_archs = set()
+    all_archs = (
+        set(
+            arch.replace("-", "_")
+            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
+        )
+        | extra_archs
+    )
+
+    print(all_archs)
+
+
+# again, not needed?
+def find_tasks(paths):
+    print(set([p.bb_task for p in paths]))
+
+
+def collect_sstate_paths(args):
+    def scandir(path, paths):
+        # Assume everything is a directory; by not checking we avoid needing an
+        # additional stat which is potentially a synchronous roundtrip over NFS
+        try:
+            for p in path.iterdir():
+                filename = p.parts[-1]
+                if filename.startswith(SSTATE_PREFIX):
+                    if filename.endswith(SSTATE_SUFFIXES):
+                        m = RE_SSTATE_PKGSPEC.match(p.parts[-1])
+                        assert m
+                        paths.add(SstateEntry(p, m))
+                    # ignore other things (includes things like lockfiles)
+                else:
+                    scandir(p, paths)
+
+        except NotADirectoryError:
+            pass
+
+    paths = set()
+    # TODO: parallelise scandir
+    scandir(Path(args.cache_dir), paths)
+
+    def path_stat(p):
+        p.stat_result = p.path.lstat()
+
+    if args.remove_duplicated:
+        # On a local filesystem this is probably a slight performance loss
+        # due to GIL contention; over NFS it's a massive win.
+        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
+            executor.map(path_stat, paths)
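+            # map() submits all the stat calls up front; exiting the "with"
+            # block waits for them all to complete.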
+
+    return paths
+
+
+def remove_by_stamps(args, paths):
+    all_sums = set()
+    for stamps_dir in args.stamps_dir:
+        stamps_path = Path(stamps_dir)
+        assert stamps_path.is_dir()
+        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
+        all_sums |= set(
+            [
+                re_sigdata.search(x.parts[-1]).group(1)
+                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
+            ]
+        )
+        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
+        all_sums |= set(
+            [
+                re_setscene.search(x.parts[-1]).group(1)
+                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
+            ]
+        )
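+        # For illustration, the globs above match stamp names shaped like
+        # (recipe and hash values are hypothetical):
+        #   <arch>/<recipe>/<stamp>.do_populate_sysroot.sigdata.<unihash>
+        #   <arch>/<recipe>/<stamp>.do_populate_sysroot_setscene.<unihash>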
+    return [p for p in paths if p.bb_unihash not in all_sums]
+
+
+def remove_duplicated(args, paths):
+    # Skip populate_lic as it produces duplicates in a normal build
+    #
+    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
+    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]
+
+    keep = dict()
+    remove = list()
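+    # Entries sharing (pn, sstate_pkgarch, bb_task, ext) but differing in hash
+    # are duplicates; keep the newest by mtime and queue the rest for removal.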
+    for p in valid_paths:
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
+        if sstate_sig not in keep:
+            keep[sstate_sig] = p
+        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
+            remove.append(keep[sstate_sig])
+            keep[sstate_sig] = p
+        else:
+            remove.append(p)
+
+    return remove
+
+
+def remove_orphans(args, paths):
+    remove = list()
+    pathsigs = defaultdict(list)
+    for p in paths:
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
+        pathsigs[sstate_sig].append(p)
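+    # If a (pn, arch, task) group contains no actual archive (SSTATE_EXTENSION),
+    # its remaining tracking files (.siginfo/.done) are orphans; drop them all.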
+    for k, v in pathsigs.items():
+        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
+            remove.extend(v)
+    return remove
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="sstate cache management utility.")
+
+    parser.add_argument(
+        "--cache-dir",
+        default=os.environ.get("SSTATE_CACHE_DIR"),
+        help="""Specify sstate cache directory, will use the environment
+            variable SSTATE_CACHE_DIR if it is not specified.""",
+    )
+
+    # parser.add_argument(
+    #     "--extra-archs",
+    #     help="""Specify list of architectures which should be tested, this list
+    #         will be extended with native arch, allarch and empty arch. The
+    #         script won't be trying to generate list of available archs from
+    #         AVAILTUNES in tune files.""",
+    # )
+
+    # parser.add_argument(
+    #     "--extra-layer",
+    #     help="""Specify the layer which will be used for searching the archs,
+    #         it will search the meta and meta-* layers in the top dir by
+    #         default, and will search meta, meta-*, <layer1>, <layer2>,
+    #         ...<layern> when specified. Use "," as the separator.
+    #
+    #         This is useless for --stamps-dir or when --extra-archs is used.""",
+    # )
+
+    parser.add_argument(
+        "-d",
+        "--remove-duplicated",
+        action="store_true",
+        help="""Remove the duplicated sstate cache files of one package, only
+            the newest one will be kept. The duplicated sstate cache files
+            of one package must have the same arch, which means sstate cache
+            files with multiple archs are not considered duplicate.
+
+            Conflicts with --stamps-dir.""",
+    )
+
+    parser.add_argument(
+        "--remove-orphans",
+        action="store_true",
+        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
+            where this is no {SSTATE_EXTENSION} file but there are associated
+            tracking files.""",
+    )
+
+    parser.add_argument(
+        "--stamps-dir",
+        action="append",
+        help="""Specify the build directory's stamps directories, the sstate
+            cache file which IS USED by these build diretories will be KEPT,
+            other sstate cache files in cache-dir will be removed. Can be
+            specified multiple times for several directories.
+
+            Conflicts with --remove-duplicated.""",
+    )
+
+    parser.add_argument(
+        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
+    )
+
+    # parser.add_argument(
+    #     "-L",
+    #     "--follow-symlink",
+    #     action="store_true",
+    #     help="Remove both the symbol link and the destination file, default: no.",
+    # )
+
+    parser.add_argument(
+        "-y",
+        "--yes",
+        action="store_true",
+        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
+            and run non-interactively.""",
+    )
+
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Explain what is being done."
+    )
+
+    parser.add_argument(
+        "-D",
+        "--debug",
+        action="count",
+        default=0,
+        help="Show debug info, repeat for more debug info.",
+    )
+
+    args = parser.parse_args()
+    if args.cache_dir is None or (
+        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
+    ):
+        parser.print_usage()
+        sys.exit(1)
+
+    return args
+
+
+def main():
+    args = parse_arguments()
+
+    paths = collect_sstate_paths(args)
+    if args.remove_duplicated:
+        remove = remove_duplicated(args, paths)
+    elif args.stamps_dir:
+        remove = remove_by_stamps(args, paths)
+    else:
+        remove = list()
+
+    if args.remove_orphans:
+        remove = set(remove) | set(remove_orphans(args, paths))
+
+    if args.debug >= 1:
+        print("\n".join([str(p.path) for p in remove]))
+    print(f"{len(remove)} out of {len(paths)} files will be removed!")
+    if not args.yes:
+        print("Do you want to continue (y/n)?")
+        confirm = input() in ("y", "Y")
+    else:
+        confirm = True
+    if confirm:
+        # TODO: parallelise remove
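+        # Path.unlink() removes a regular file or the symlink itself; it does
+        # not follow the link (cf. the commented-out --follow-symlink option).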
+        for p in remove:
+            p.path.unlink()
+
+
+if __name__ == "__main__":
+    main()