#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#

import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# .siginfo.done files are mentioned in the original script?
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)
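
# Illustrative example of a filename the pattern above matches (all field
# values here are hypothetical):
#
#   sstate:zlib:core2-64-poky-linux:1.3.1:r0:core2-64:10:0123abcd_populate_sysroot.tar.zst
#
# which yields pn=zlib, package_target=core2-64-poky-linux, pv=1.3.1, pr=r0,
# sstate_pkgarch=core2-64, sstate_version=10, bb_unihash=0123abcd,
# bb_task=populate_sysroot and ext=.tar.zst.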

# Really we'd like something like a Path subclass which implements a stat
# cache here; unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: os.stat_result = None

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        return self.match.group(name)
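
# Note: __getattr__ above forwards unknown attribute lookups to the stored
# regex match, so entry.pn, entry.bb_task, entry.ext etc. read the named
# groups parsed out of the filename.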


# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)


# again, not needed? NB: this reads a module-level `paths` which is never
# defined, so calling it as-is would raise a NameError.
def find_tasks():
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(p.parts[-1])
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)
        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # Only the duplicate check compares mtimes, so only stat() in that case.
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]
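
# For reference, the stamp files scanned above are expected to look roughly
# like this (hypothetical arch/recipe/hash values), two directory levels deep:
#
#   <stamps-dir>/core2-64-poky-linux/zlib/1.3.1-r0.do_configure.sigdata.0123abcd
#
# and the trailing hash is what gets collected into all_sums.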


def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

    keep = dict()
    remove = list()
    for p in valid_paths:
        # The key deliberately omits package_target, pv, pr and the unihash:
        # entries differing only in those fields count as duplicates, and the
        # one with the newest mtime wins.
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove
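
# An "orphan" here is a group of tracking files whose archive is gone, e.g. a
# lone foo.tar.zst.siginfo (hypothetical name) with no matching foo.tar.zst;
# the whole group is removed.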


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; defaults to the environment
                variable SSTATE_CACHE_DIR if not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify list of architectures which should be tested, this list
    #             will be extended with native arch, allarch and empty arch. The
    #             script won't be trying to generate list of available archs from
    #             AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs,
    #             it will search the meta and meta-* layers in the top dir by
    #             default, and will search meta, meta-*, <layer1>, <layer2>,
    #             ...<layern> when specified. Use "," as the separator.
    #
    #             This is useless for --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files for a package; only the
                newest one will be kept. Duplicates must have the same arch,
                which means sstate cache files with different archs are not
                considered duplicates. Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
                where there is no {SSTATE_EXTENSION} file but there are
                associated tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; the sstate
                cache files which ARE USED by these build directories will be
                KEPT, while other sstate cache files in cache-dir will be
                removed. Can be specified multiple times for several
                directories. Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbolic link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-n", "--dry-run", action="store_true", help="Don't execute, just go through the motions."
    )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
                and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args


def main():
    args = parse_arguments()
    paths = collect_sstate_paths(args)

    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))

    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if args.dry_run:
        return 0

    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True

    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()
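
# Example invocations (illustrative paths; adjust to your setup):
#
#   Dry-run duplicate removal against a shared cache:
#     ./sstate-cache-management.py --cache-dir=/srv/sstate-cache --remove-duplicated -n
#
#   Keep only entries referenced by a build's stamps, answering "yes" to prompts:
#     ./sstate-cache-management.py --cache-dir=/srv/sstate-cache --stamps-dir=build/tmp/stamps -y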