cloud.py
# -*- coding: utf-8 -*-
# Copyright 2017-2018 Yelp
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import pipes
import socket
import random
import signal
import time
from copy import deepcopy
from subprocess import Popen
from subprocess import PIPE
from mrjob.bin import MRJobBinRunner
from mrjob.bin import _unarchive_cmd
from mrjob.conf import combine_dicts
from mrjob.py2 import integer_types
from mrjob.py2 import xrange
from mrjob.setup import WorkingDirManager
from mrjob.setup import parse_setup_cmd
from mrjob.util import cmd_line
log = logging.getLogger(__name__)
# don't try to bind SSH tunnel to more than this many local ports
_MAX_SSH_RETRIES = 20
# issue a warning if max_mins_idle is set to less than this
_DEFAULT_MAX_MINS_IDLE = 10.0
# default part size (so we can share with Spark runner)
_DEFAULT_CLOUD_PART_SIZE_MB = 100
class HadoopInTheCloudJobRunner(MRJobBinRunner):
"""Abstract base class for all Hadoop-in-the-cloud services."""
alias = '_cloud'
OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
'bootstrap',
'bootstrap_python',
'check_cluster_every',
'cloud_fs_sync_secs',
'cloud_part_size_mb',
'cloud_tmp_dir',
'cluster_id',
'core_instance_type',
'extra_cluster_params',
'hadoop_streaming_jar',
'image_id',
'image_version',
'instance_type',
'master_instance_type',
'max_mins_idle',
'num_core_instances',
'num_task_instances',
'region',
'ssh_bind_ports',
'ssh_tunnel',
'ssh_tunnel_is_open',
'task_instance_type',
'zone',
}
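# Note: concrete runners extend this set rather than replacing it. A
# minimal sketch of the pattern (the opt name below is hypothetical,
# not taken from any real runner):
#
#     class MyCloudJobRunner(HadoopInTheCloudJobRunner):
#         OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
#             'my_service_account',
#         }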
def __init__(self, **kwargs):
super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)
# if *cluster_id* is not set, ``self._cluster_id`` will be
# set when we create or join a cluster
self._cluster_id = self._opts['cluster_id']
# bootstrapping
self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()
# add files to manager
self._bootstrap_dir_mgr = WorkingDirManager()
for cmd in self._bootstrap:
for token in cmd:
if isinstance(token, dict):
# convert dir archive tokens to archives
if token['type'] == 'dir':
token['path'] = self._dir_archive_path(token['path'])
token['type'] = 'archive'
self._bootstrap_dir_mgr.add(**token)
# we'll create this script later, as needed
self._master_bootstrap_script_path = None
# ssh state
# the process for the SSH tunnel
self._ssh_proc = None
# if this is true, stop trying to launch the SSH tunnel
self._give_up_on_ssh_tunnel = False
# store the (tunneled) URL of the job tracker/resource manager
self._ssh_tunnel_url = None
### Options ###
@classmethod
def _default_opts(cls):
return combine_dicts(
super(HadoopInTheCloudJobRunner, cls)._default_opts(),
dict(
cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
max_mins_idle=_DEFAULT_MAX_MINS_IDLE,
# don't use a list because it makes it hard to read option
# values when running in verbose mode. See #1284
ssh_bind_ports=xrange(40001, 40841),
ssh_tunnel=False,
ssh_tunnel_is_open=False,
# ssh_bin isn't included here. For example, the Dataproc
# runner launches ssh through the gcloud util
),
)
def _fix_opts(self, opts, source=None):
opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
opts, source=source)
# cloud_part_size_mb should be a number
if opts.get('cloud_part_size_mb') is not None:
if not isinstance(opts['cloud_part_size_mb'],
(integer_types, float)):
raise TypeError('cloud_part_size_mb must be a number')
return opts
def _combine_opts(self, opt_list):
"""Propagate *instance_type* to other instance type opts, if not
already set.
Also propagate core instance type to task instance type, if it's
not already set.
"""
opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)
if opts['instance_type']:
# figure out how late in the configs opt was set (setting
# --instance_type on the command line overrides core_instance_type
# set in configs)
opt_priority = {k: -1 for k in opts}
for i, sub_opts in enumerate(opt_list):
for k, v in sub_opts.items():
if v == opts[k]:
opt_priority[k] = i
# instance_type only affects master_instance_type if there are
# no other instances
if opts['num_core_instances'] or opts['num_task_instances']:
propagate_to = ['core_instance_type', 'task_instance_type']
else:
propagate_to = ['master_instance_type']
for k in propagate_to:
if opts[k] is None or (
opt_priority[k] < opt_priority['instance_type']):
opts[k] = opts['instance_type']
if not opts['task_instance_type']:
opts['task_instance_type'] = opts['core_instance_type']
return opts
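# Illustrative example of the propagation above (instance type names are
# made up):
#
#     # --instance-type m5.xlarge with num_core_instances=2
#     # => core_instance_type and task_instance_type become 'm5.xlarge'
#     #    (unless set later in the configs or on the command line)
#     # --instance-type m5.xlarge with no core/task instances
#     # => only master_instance_type becomes 'm5.xlarge', since the
#     #    master is the only node that runs tasks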
### Bootstrapping ###
def _bootstrap_python(self):
"""Redefine this to return a (possibly empty) list of parsed commands
(in the same format as returned by parse_setup_cmd()) to make sure a
compatible version of Python is installed.
If the *bootstrap_python* option is false, should always return ``[]``.
"""
return []
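# A hypothetical override might look like the sketch below; the command
# is illustrative only, and each command is in the token-list format
# produced by parse_setup_cmd():
#
#     def _bootstrap_python(self):
#         if not self._opts['bootstrap_python']:
#             return []
#         return [['sudo apt-get install -y python3 python3-pip']]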
def _cp_to_local_cmd(self):
"""Command to copy files from the cloud to the local directory
(usually via Hadoop). Redefine this as needed; for example, on EMR,
we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
installed at bootstrap time."""
return 'hadoop fs -copyToLocal'
def _parse_bootstrap(self):
"""Parse the *bootstrap* option with
:py:func:`mrjob.setup.parse_setup_cmd()`.
"""
return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]
def _create_master_bootstrap_script_if_needed(self):
"""Helper for :py:meth:`_add_bootstrap_files_for_upload`.
Create the master bootstrap script and write it into our local
temp directory. Set self._master_bootstrap_script_path.
This will do nothing if there are no bootstrap scripts or commands,
or if it has already been called."""
if self._master_bootstrap_script_path:
return
# don't bother if we're not starting a cluster
if self._cluster_id:
return
# Also don't bother if we're not bootstrapping
if not self._bootstrap:
return
path = os.path.join(self._get_local_tmp_dir(), 'b.sh')
log.info('writing master bootstrap script to %s' % path)
contents = self._master_bootstrap_script_content(
self._bootstrap)
self._write_script(contents, path, 'master bootstrap script')
self._master_bootstrap_script_path = path
def _master_bootstrap_script_content(self, bootstrap):
"""Return a list containing the lines of the master bootstrap script.
(without trailing newlines)
"""
out = []
# shebang, precommands
out.extend(self._start_of_sh_script())
out.append('')
# for example, create a tmp dir and cd to it
if self._bootstrap_pre_commands():
out.extend(self._bootstrap_pre_commands())
out.append('')
# store $PWD
out.append('# store $PWD')
out.append('__mrjob_PWD=$PWD')
out.append('')
# special case for PWD being in /, which happens on Dataproc
# (really we should cd to tmp or something)
out.append('if [ $__mrjob_PWD = "/" ]; then')
out.append(' __mrjob_PWD=""')
out.append('fi')
out.append('')
# run commands in a block so we can redirect stdout to stderr
# (e.g. to catch errors from compileall). See #370
out.append('{')
# download files
out.append(' # download files and mark them executable')
cp_to_local = self._cp_to_local_cmd()
# TODO: why bother with $__mrjob_PWD here, since we're already in it?
for name, path in sorted(
self._bootstrap_dir_mgr.name_to_path('file').items()):
uri = self._upload_mgr.uri(path)
out.append(' %s %s $__mrjob_PWD/%s' %
(cp_to_local, pipes.quote(uri), pipes.quote(name)))
# imitate Hadoop Distributed Cache (see #1602)
out.append(' chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))
out.append('')
# download and unarchive archives
archive_names_and_paths = sorted(
self._bootstrap_dir_mgr.name_to_path('archive').items())
if archive_names_and_paths:
# make tmp dir if needed
out.append(' # download and unpack archives')
out.append(' __mrjob_TMP=$(mktemp -d)')
out.append('')
for name, path in archive_names_and_paths:
uri = self._upload_mgr.uri(path)
archive_file_name = self._bootstrap_dir_mgr.name(
'archive_file', path)
# copy file to tmp dir
quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(
archive_file_name)
out.append(' %s %s %s' % (
cp_to_local, pipes.quote(uri), quoted_archive_path))
out.append(' ' + _unarchive_cmd(path) % dict(
file=quoted_archive_path,
dir='$__mrjob_PWD/' + pipes.quote(name)))
# imitate Hadoop Distributed Cache (see #1602)
out.append(
' chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))
out.append('')
# run bootstrap commands
out.append(' # bootstrap commands')
for cmd in bootstrap:
# reconstruct the command line, substituting $__mrjob_PWD/<name>
# for path dicts
line = ' '
for token in cmd:
if isinstance(token, dict):
# it's a path dictionary
line += '$__mrjob_PWD/'
line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
else:
# it's raw script
line += token
out.append(line)
out.append('} 1>&2') # stdout -> stderr for ease of error log parsing
return out
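# For orientation, the generated bootstrap script roughly has this shape
# (an abridged, illustrative sketch; the shebang, URIs and names depend
# on the opts and the files actually uploaded):
#
#     #!/bin/sh -ex
#
#     # store $PWD
#     __mrjob_PWD=$PWD
#
#     {
#       # download files and mark them executable
#       hadoop fs -copyToLocal <uri> $__mrjob_PWD/<name>
#       chmod u+rx $__mrjob_PWD/<name>
#
#       # bootstrap commands
#       <your bootstrap commands, with paths rewritten to $__mrjob_PWD/...>
#     } 1>&2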
def _bootstrap_pre_commands(self):
"""A list of hard-coded commands to run at the beginning of the
bootstrap script. Currently used by dataproc to cd into a tmp dir."""
return []
def _start_of_sh_script(self):
"""Return a list of lines (without trailing newlines) containing the
shell script shebang and pre-commands."""
out = []
# shebang
sh_bin = self._sh_bin()
if not sh_bin[0].startswith('/'):
sh_bin = ['/usr/bin/env'] + sh_bin
out.append('#!' + cmd_line(sh_bin))
# hook for 'set -e', etc. (see #1549)
out.extend(self._sh_pre_commands())
return out
### Launching Clusters ###
def _add_extra_cluster_params(self, params):
"""Return a dict with the *extra_cluster_params* opt patched into
*params*, and ``None`` values removed."""
params = deepcopy(params)
for k, v in sorted(self._opts['extra_cluster_params'].items()):
_patch_params(params, k, v)
return params
### SSH Tunnel ###
def _ssh_tunnel_args(self, bind_port):
"""Redefine this in your subclass. You will probably want to call
:py:meth:`_ssh_tunnel_opts` somewhere in here.
Should return the list of args used to run the command
to open the SSH tunnel, bound to *bind_port* on your computer,
or ``None`` if it isn't possible to set up an SSH tunnel.
"""
return None
def _ssh_tunnel_config(self):
"""Redefine this in your subclass. Should return a dict with the
following keys:
*localhost*: once we SSH in, is the web interface reachable at
``localhost``?
*name*: either ``'job tracker'`` or ``'resource manager'``
*path*: path of main page on web interface (e.g. "/cluster")
*port*: port number of the web interface
"""
raise NotImplementedError
def _launch_ssh_proc(self, args):
"""The command used to create a :py:class:`subprocess.Popen` to
run the SSH tunnel. You usually don't need to redefine this."""
log.debug('> %s' % cmd_line(args))
return Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
def _ssh_launch_wait_secs(self):
"""Wait this long after launching the SSH process before checking
for failure (default 1 second). You may redefine this."""
return 1.0
def _set_up_ssh_tunnel(self):
"""Call this whenever you think it is possible to SSH to your cluster.
This sets :py:attr:`_ssh_proc`. Does nothing if :mrjob-opt:`ssh_tunnel`
is not set, or there is already a tunnel process running.
"""
# did the user request an SSH tunnel?
if not self._opts['ssh_tunnel']:
return
# no point in trying to launch a nonexistent command twice
if self._give_up_on_ssh_tunnel:
return
# did we already launch the SSH tunnel process? is it still running?
if self._ssh_proc:
self._ssh_proc.poll()
if self._ssh_proc.returncode is None:
return
else:
log.warning(' Oops, ssh subprocess exited with return code'
' %d, restarting...' % self._ssh_proc.returncode)
self._ssh_proc = None
tunnel_config = self._ssh_tunnel_config()
bind_port = None
popen_exception = None
ssh_tunnel_args = []
for bind_port in self._pick_ssh_bind_ports():
ssh_proc = None
ssh_tunnel_args = self._ssh_tunnel_args(bind_port)
# can't launch SSH tunnel right now
if not ssh_tunnel_args:
return
try:
ssh_proc = self._launch_ssh_proc(ssh_tunnel_args)
except OSError as ex:
# e.g. OSError(2, 'File not found')
popen_exception = ex # warning handled below
break
if ssh_proc:
time.sleep(self._ssh_launch_wait_secs())
ssh_proc.poll()
# still running. We are golden
if ssh_proc.returncode is None:
self._ssh_proc = ssh_proc
break
else:
ssh_proc.stdin.close()
ssh_proc.stdout.close()
ssh_proc.stderr.close()
if self._ssh_proc:
if self._opts['ssh_tunnel_is_open']:
bind_host = socket.getfqdn()
else:
bind_host = 'localhost'
self._ssh_tunnel_url = 'http://%s:%d%s' % (
bind_host, bind_port, tunnel_config['path'])
log.info(' Connect to %s at: %s' % (
tunnel_config['name'], self._ssh_tunnel_url))
else:
if popen_exception:
# this only happens if the ssh binary is not present
# or not executable (so tunnel_config and the args to the
# ssh binary don't matter)
log.warning(
" Couldn't open SSH tunnel: %s" % popen_exception)
self._give_up_on_ssh_tunnel = True
return
else:
log.warning(
' Failed to open ssh tunnel to %s' %
tunnel_config['name'])
def _kill_ssh_tunnel(self):
"""Send SIGKILL to SSH tunnel, if it's running."""
if not self._ssh_proc:
return
self._ssh_proc.poll()
if self._ssh_proc.returncode is None:
log.info('Killing our SSH tunnel (pid %d)' %
self._ssh_proc.pid)
self._ssh_proc.stdin.close()
self._ssh_proc.stdout.close()
self._ssh_proc.stderr.close()
try:
if hasattr(signal, 'SIGKILL'):
os.kill(self._ssh_proc.pid, signal.SIGKILL)
else:
# Windows doesn't have SIGKILL, see #1892
os.kill(self._ssh_proc.pid, signal.SIGABRT)
except Exception as e:
log.exception(e)
self._ssh_proc = None
self._ssh_tunnel_url = None
def _ssh_tunnel_opts(self, bind_port):
"""Options to SSH related to setting up a tunnel (rather than
SSHing in). Helper for :py:meth:`_ssh_tunnel_args`.
"""
args = self._ssh_local_tunnel_opt(bind_port) + [
'-N', '-n', '-q',
]
if self._opts['ssh_tunnel_is_open']:
args.extend(['-g', '-4']) # -4: listen on IPv4 only
return args
def _ssh_local_tunnel_opt(self, bind_port):
"""Helper for :py:meth:`_ssh_tunnel_opts`."""
tunnel_config = self._ssh_tunnel_config()
return [
'-L', '%d:%s:%d' % (
bind_port,
self._job_tracker_host(),
tunnel_config['port'],
),
]
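# Putting the two helpers above together, the tunnel-related portion of
# the ssh command line looks roughly like this (port number and host
# are illustrative):
#
#     -L 40123:ip-10-0-0-1.internal:8088 -N -n -q
#
# with '-g -4' appended when ssh_tunnel_is_open is set.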
def _pick_ssh_bind_ports(self):
"""Pick a list of ports to try binding our SSH tunnel to.
We will try to bind the same port for any given cluster (Issue #67)
"""
# don't perturb the random number generator
random_state = random.getstate()
try:
# seed random port selection on cluster ID
random.seed(self._cluster_id)
num_picks = min(_MAX_SSH_RETRIES,
len(self._opts['ssh_bind_ports']))
return random.sample(self._opts['ssh_bind_ports'], num_picks)
finally:
random.setstate(random_state)
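# Sketch of the behavior above: seeding the RNG with the cluster ID makes
# the candidate ports deterministic per cluster, e.g. (cluster ID is
# illustrative):
#
#     random.seed('j-CLUSTERID')
#     random.sample(range(40001, 40841), 20)  # same ordering every run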
def _patch_params(params, name, value):
"""Helper method for _add_extra_cluster_params().
Set *name* in *params* to *value*.
If *name* has one or more dots in it, recursively set the value
in successive nested dictionaries, creating them if necessary.
For example, if *name* is ``Instances.EmrManagedMasterSecurityGroup``,
set ``params['Instances']['EmrManagedMasterSecurityGroup']``.
If *value* is ``None``, delete the value (if it exists), rather than
setting it to ``None``.
"""
if not isinstance(params, dict):
raise TypeError('must be a dictionary')
if '.' in name:
head, rest = name.split('.', 1)
_patch_params(params.setdefault(head, {}), rest, value)
elif value is None:
if name in params:
del params[name]
elif isinstance(value, dict) and isinstance(params.get(name), dict):
# recursively patch dicts rather than clobbering them (see #2154)
for k, v in value.items():
_patch_params(params[name], k, v)
else:
params[name] = value
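# Illustrative example of _patch_params() with a dotted name, reusing the
# key from the docstring above (the values are made up):
#
#     params = {'Instances': {'InstanceCount': 3}}
#     _patch_params(params, 'Instances.EmrManagedMasterSecurityGroup',
#                   'sg-0123')
#     # params == {'Instances': {'InstanceCount': 3,
#     #                          'EmrManagedMasterSecurityGroup': 'sg-0123'}}
#
#     # passing None for *value* deletes the key instead of setting it
#     _patch_params(params, 'Instances.EmrManagedMasterSecurityGroup', None)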