artifact_cache.py - mozsearch

mozilla-central/python/mozbuild/mozbuild/artifact_cache.py (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""

Fetch and cache artifacts from URLs.

This module manages fetching artifacts from URLS and purging old

artifacts using a simple Least Recently Used cache.

This module requires certain modules be importable from the ambient Python

environment.  Consumers will need to arrange this themselves.

The bulk of the complexity is in managing and persisting several caches.  If

we found a Python LRU cache that pickled cleanly, we could remove a lot of

this code!  Sadly, I found no such candidate implementations, so we pickle

pylru caches manually.

None of the instances (or the underlying caches) are safe for concurrent use.

A future need, perhaps.

"""

import binascii

import hashlib

import logging

import os

import dlmanager

import mozpack.path as mozpath

import six

import six.moves.urllib.parse as urlparse

from mozbuild.dirutils import mkdir

# Using 'DownloadManager' through the provided interface we

# can't directly specify a 'chunk_size' for the 'Download' it manages.

# One way to get it to use the 'chunk_size' we want is to monkeypatch

# the defaults of the init function for the 'Download' class.

CHUNK_SIZE = 16 * 1024 * 1024  # 16 MB in bytes.

dl_init = dlmanager.Download.__init__

dl_init.__defaults__ = (

    dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]

# Minimum number of downloaded artifacts to keep. Each artifact can be very large,

# so don't make this to large!

MIN_CACHED_ARTIFACTS = 12

# Maximum size of the downloaded artifacts to keep in cache, in bytes (2GiB).

MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024

class ArtifactPersistLimit(dlmanager.PersistLimit):

    """Handle persistence for a cache of artifacts.

    When instantiating a DownloadManager, it starts by filling the

    PersistLimit instance it's given with register_dir_content.

    In practice, this registers all the files already in the cache directory.

    After a download finishes, the newly downloaded file is registered, and the

    oldest files registered to the PersistLimit instance are removed depending

    on the size and file limits it's configured for.

    This is all good, but there are a few tweaks we want here:

      - We have pickle files in the cache directory that we don't want purged.

      - Files that were just downloaded in the same session shouldn't be

        purged. (if for some reason we end up downloading more than the default

        max size, we don't want the files to be purged)

    To achieve this, this subclass of PersistLimit inhibits the register_file

    method for pickle files and tracks what files were downloaded in the same

    session to avoid removing them.

    The register_file method may be used to register cache matches too, so that

    later sessions know they were freshly used.

"""

    def __init__(self, log=None):

        super(ArtifactPersistLimit, self).__init__(

            size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS

        self._log = log

        self._registering_dir = False

        self._downloaded_now = set()

    def log(self, *args, **kwargs):

        if self._log:

            self._log(*args, **kwargs)

    def register_file(self, path):

        if (

            path.endswith(".pickle")

            or path.endswith(".checksum")

            or os.path.basename(path) == ".metadata_never_index"

):

            return

        if not self._registering_dir:

            # Touch the file so that subsequent calls to a mach artifact

            # command know it was recently used. While remove_old_files

            # is based on access time, in various cases, the access time is not

            # updated when just reading the file, so we force an update.

            try:

                os.utime(path, None)

            except OSError:

                pass

            self._downloaded_now.add(path)

        super(ArtifactPersistLimit, self).register_file(path)

    def register_dir_content(self, directory, pattern="*"):

        self._registering_dir = True

        super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)

        self._registering_dir = False

    def remove_old_files(self):

        from dlmanager import fs

        files = sorted(self.files, key=lambda f: f.stat.st_atime)

        kept = []

        while len(files) > self.file_limit and self._files_size >= self.size_limit:

            f = files.pop(0)

            if f.path in self._downloaded_now:

                kept.append(f)

                continue

            try:

                fs.remove(f.path)

            except WindowsError:

                # For some reason, on automation, we can't remove those files.

                # So for now, ignore the error.

                kept.append(f)

                continue

            self.log(

                logging.INFO,

                "artifact",

                {"filename": f.path},

                "Purged artifact {filename}",

            self._files_size -= f.stat.st_size

        self.files = files + kept

    def remove_all(self):

        from dlmanager import fs

        for f in self.files:

            fs.remove(f.path)

        self._files_size = 0

        self.files = []

class ArtifactCache(object):

    """Fetch artifacts from URLS and purge least recently used artifacts from disk."""

    def __init__(self, cache_dir, log=None, skip_cache=False):

        mkdir(cache_dir, not_indexed=True)

        self._cache_dir = cache_dir

        self._log = log

        self._skip_cache = skip_cache

        self._persist_limit = ArtifactPersistLimit(log)

        self._download_manager = dlmanager.DownloadManager(

            self._cache_dir, persist_limit=self._persist_limit

        self._last_dl_update = -1

    def log(self, *args, **kwargs):

        if self._log:

            self._log(*args, **kwargs)

    def fetch(self, url, force=False):

        fname = os.path.basename(url)

        try:

            # Use the file name from the url if it looks like a hash digest.

            if len(fname) not in (32, 40, 56, 64, 96, 128):

                raise TypeError()

            binascii.unhexlify(fname)

        except (TypeError, binascii.Error):

            # We download to a temporary name like HASH[:16]-basename to

            # differentiate among URLs with the same basenames.  We used to then

            # extract the build ID from the downloaded artifact and use it to make a

            # human readable unique name, but extracting build IDs is time consuming

            # (especially on Mac OS X, where we must mount a large DMG file).

            hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]

            # Strip query string and fragments.

            basename = os.path.basename(urlparse.urlparse(url).path)

            fname = hash + "-" + basename

        path = os.path.abspath(mozpath.join(self._cache_dir, fname))

        if self._skip_cache and os.path.exists(path):

            self.log(

                logging.INFO,

                "artifact",

                {"path": path},

                "Skipping cache: removing cached downloaded artifact {path}",

            os.remove(path)

        try:

            dl = self._download_manager.download(url, fname)

            def download_progress(dl, bytes_so_far, total_size):

                if not total_size:

                    return

                percent = (float(bytes_so_far) / total_size) * 100

                now = int(percent / 5)

                if now == self._last_dl_update:

                    return

                self._last_dl_update = now

                self.log(

                    logging.INFO,

                    "artifact",

                        "bytes_so_far": bytes_so_far,

                        "total_size": total_size,

                        "percent": percent,

},

                    "Downloading... {percent:02.1f} %",

            if dl:

                self.log(

                    logging.INFO,

                    "artifact",

                    {"path": path},

                    "Downloading artifact to local cache: {path}",

                dl.set_progress(download_progress)

                dl.wait()

            else:

                self.log(

                    logging.INFO,

                    "artifact",

                    {"path": path},

                    "Using artifact from local cache: {path}",

                # Avoid the file being removed if it was in the cache already.

                path = os.path.join(self._cache_dir, fname)

                self._persist_limit.register_file(path)

            return os.path.abspath(mozpath.join(self._cache_dir, fname))

        finally:

            # Cancel any background downloads in progress.

            self._download_manager.cancel()

    def clear_cache(self):

        if self._skip_cache:

            self.log(

                logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"

            return

        self._persist_limit.remove_all()