pilot3/pilot.py at master · PanDAWMS/pilot3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Authors:
# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17
# - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
# - Paul Nilsson, paul.nilsson@cern.ch, 2017-25

"""This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py <args>'."""

import logging
import os
import sys
import threading
import time
from os import getcwd, chdir, environ
from os.path import exists, join
from shutil import rmtree
from typing import Any

from arguments import get_args
from pilot.common.errorcodes import ErrorCodes
from pilot.common.exception import PilotException
from pilot.common.pilotcache import get_pilot_cache
from pilot.info import infosys
from pilot.util.auxiliary import (
    convert_signal_to_exit_code,
    pilot_version_banner,
    shell_exit_code,
)
from pilot.util.batchsystem import is_htcondor_version_sufficient
from pilot.util.cgroups import create_cgroup
from pilot.util.config import config
from pilot.util.constants import (
    get_pilot_version,
    ERRNO_NOJOBS,
    FAILURE,
    PILOT_END_TIME,
    PILOT_MULTIJOB_START_TIME,
    PILOT_START_TIME,
    SERVER_UPDATE_NOT_DONE,
)
from pilot.util.cvmfs import (
    cvmfs_diagnostics,
    get_last_update,
    is_cvmfs_available,
)
from pilot.util.filehandling import (
    get_pilot_work_dir,
    mkdirs,
)
from pilot.util.harvester import (
    is_harvester_mode,
    kill_worker,
)
from pilot.util.heartbeat import update_pilot_heartbeat
from pilot.util.https import (
    get_panda_server,
    https_setup,
    send_update,
    update_local_oidc_token_info,
    get_memory_limits
)
from pilot.util.loggingsupport import establish_logging
from pilot.util.networking import dump_ipv6_info
from pilot.util.processgroups import find_defunct_subprocesses
from pilot.util.timing import add_to_pilot_timing
from pilot.util.workernode import (
    get_node_name,
    get_workernode_map,
    get_workernode_gpu_map
)

errors = ErrorCodes()
pilot_cache = get_pilot_cache()
mainworkdir = ""
args = None
trace = None


def main() -> int:  # noqa: C901
    """
    Prepare for and execute the requested workflow.

    :return: exit code (int).
    """
    # get the logger
    logger = logging.getLogger(__name__)

    # print the pilot version and other information
    pilot_version_banner()
    dump_ipv6_info()

    # define threading events
    args.graceful_stop = threading.Event()
    args.abort_job = threading.Event()
    args.job_aborted = threading.Event()

    # define useful variables
    args.retrieve_next_job = True  # go ahead and download a new job
    args.signal = None  # to store any incoming signals
    args.signal_counter = (
        0  # keep track of number of received kill signal (suicide counter)
    )
    args.kill_time = 0  # keep track of when first kill signal arrived

    # perform https setup
    if args.use_https:
        https_setup(args, get_pilot_version())
    args.amq = None

    # let the server know that the worker has started
    if args.update_server and args.workerpilotstatusupdate:
        send_worker_status(
            "started", args.queue, args.url, args.port, logger, "IPv6"
        )  # note: assuming IPv6, fallback in place

    #
    if "PTEST" in args.queue:
        args.getjob_failures = 1

    # check cvmfs if available (skip test if either NO_CVMFS_OK env var is set or pilot option --nocvmfs is used)
    if args.cvmfs:
        ec = check_cvmfs(logger)
        if ec:
            cvmfs_diagnostics()
            return ec

    if not args.rucio_host:
        args.rucio_host = config.Rucio.host

    # initialize InfoService
    try:
        infosys.init(args.queue)
        pilot_cache.queuedata = infosys.queuedata
        pilot_cache.harvester_submitmode = args.harvester_submitmode.lower()

        # check if queue is ACTIVE
        if infosys.queuedata.state != "ACTIVE":
            logger.critical(
                f"specified queue is NOT ACTIVE: {infosys.queuedata.name} -- aborting"
            )
            return errors.PANDAQUEUENOTACTIVE

        # make sure the queue is online
        if infosys.queuedata.status.lower() == "offline":
            logger.critical(
                f"specified queue is OFFLINE: {infosys.queuedata.name} -- aborting"
            )
            return errors.PANDAQUEUENOTONLINE

    except PilotException as error:
        logger.fatal(error)
        return error.get_error_code()

    # update the OIDC token if necessary (after queuedata has been downloaded, since PQ.catchall can contain instruction to prevent token renewal)
    if 'no_token_renewal' in infosys.queuedata.catchall or args.token_renewal is False:
        logger.info("OIDC token will not be renewed by the pilot")
    else:
        try:
            update_local_oidc_token_info(args.url, args.port)
        except Exception as exc:
            logger.warning(f"failed to update local OIDC token: {exc}")

    # create and report the worker node map
    # note: the worker node map will always be created, but only sent to the server
    # if the user plugin specifies it. For ATLAS there is a special case for Nordugrid (args.update_server == False)
    # in which the map is not sent to the PanDA server in this function, but later when the jobReport is uploaded
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0)
    if user.allow_send_workernode_map():
        try:
            send_workernode_map(infosys.queuedata.site, infosys.queuedata.name, args.url, args.port, "IPv6", logger)  # note: assuming IPv6, fallback in place
        except Exception as error:
            logger.warning(f"exception caught when sending workernode map: {error}")
        if args.update_server:
            try:
                memory_limits = get_memory_limits(args.url, args.port)
            except Exception as error:
                logger.warning(f"exception caught when getting resource types: {error}")
            else:
                logger.debug(f"resource types: {memory_limits}")
                if memory_limits:
                    pilot_cache.resource_types = memory_limits

    # handle special CRIC variables via params
    # internet protocol versions 'IPv4' or 'IPv6' can be set via CRIC PQ.params.internet_protocol_version
    # (must be defined per PQ if wanted). The pilot default is IPv6
    args.internet_protocol_version = (
        infosys.queuedata.params.get("internet_protocol_version", "IPv6")
        if infosys.queuedata.params
        else "IPv6"
    )
    environ["PILOT_IP_VERSION"] = args.internet_protocol_version

    # set the site name for rucio
    environ["PILOT_RUCIO_SITENAME"] = (
        os.environ.get("PILOT_RUCIO_SITENAME", "") or infosys.queuedata.site
    )
    logger.debug(f'PILOT_RUCIO_SITENAME={os.environ.get("PILOT_RUCIO_SITENAME")}')

    # store the site name as set with a pilot option
    environ[
        "PILOT_SITENAME"
    ] = infosys.queuedata.resource  # args.site  # TODO: replace with singleton

    logger.info(f"pilot arguments: {args}")

    # update the pilot heartbeat file
    update_pilot_heartbeat(time.time())

    # set requested workflow
    workflow = __import__(
        f"pilot.workflow.{args.workflow}", globals(), locals(), [args.workflow], 0
    )

    # execute workflow
    try:
        exitcode = workflow.run(args)
    except Exception as exc:
        logger.fatal(f"main pilot function caught exception: {exc}")
        exitcode = None

    # let the server know that the worker has finished
    if args.update_server and args.workerpilotstatusupdate:
        send_worker_status(
            "finished",
            args.queue,
            args.url,
            args.port,
            logger,
            args.internet_protocol_version,
        )

    return exitcode


def check_cvmfs(logger: Any) -> int:
    """
    Check if cvmfs is available.

    Args:
        logger (Any): Logging object.

    Returns:
        int: Exit code.
    """
    # skip all tests if required
    if os.environ.get("NO_CVMFS_OK", False):
        logger.info("skipping cvmfs checks")
        return 0

    is_available = is_cvmfs_available()
    if is_available is None:
        pass  # ignore this case
    elif is_available is True:
        timestamp = get_last_update()
        if timestamp and timestamp > 0:
            logger.info('CVMFS has been validated')
        else:
            logger.warning('CVMFS is not responding - aborting pilot')
            return errors.CVMFSISNOTALIVE
    else:
        logger.warning('CVMFS is not alive - aborting pilot')
        return errors.CVMFSISNOTALIVE

    return 0


def create_main_work_dir() -> (int, str):
    """
    Create and return the pilot's main work directory.

    The function also sets args.mainworkdir and cd's into this directory.
    Note: args, used in this function, is defined in outer scope.

    Returns:
        tuple: exit code (int), main work directory (str).
    """
    exitcode = 0

    if args.workdir != "":
        _mainworkdir = get_pilot_work_dir(args.workdir)
        try:
            # create the main PanDA Pilot work directory
            mkdirs(_mainworkdir)
        except PilotException as error:
            # print to stderr since logging has not been established yet
            print(
                f"failed to create workdir at {_mainworkdir} -- aborting: {error}",
                file=sys.stderr,
            )
            exitcode = shell_exit_code(error._error_code)
    else:
        _mainworkdir = getcwd()

    args.mainworkdir = _mainworkdir
    chdir(_mainworkdir)

    return exitcode, _mainworkdir


def set_environment_variables():
    """
    Set relevant environment variables.

    This function sets PILOT_WORK_DIR, PILOT_HOME, PILOT_SITENAME, PILOT_USER and PILOT_VERSION and others.
    Note: args and mainworkdir, used in this function, are defined in outer scope.
    """
    # working directory as set with a pilot option (e.g. ..)
    environ["PILOT_WORK_DIR"] = args.workdir  # TODO: replace with singleton
    pilot_cache.pilot_work_dir = args.workdir

    # main work directory (e.g. /scratch/PanDA_Pilot3_3908_1537173670)
    environ["PILOT_HOME"] = mainworkdir  # TODO: replace with singleton
    pilot_cache.pilot_home_dir = mainworkdir

    # how many stage-out attempts should be made per file?
    # NOTE: do not use the pilot cache for this since it complicates middleware containerization
    # pilot_cache.stageout_attempts = args.stageout_attempts
    os.environ['PILOT_STAGEOUT_ATTEMPTS'] = str(args.stageout_attempts)

    # pilot source directory (e.g. /cluster/home/usatlas1/gram_scratch_hHq4Ns/condorg_oqmHdWxz)
    if not environ.get("PILOT_SOURCE_DIR", None):
        environ["PILOT_SOURCE_DIR"] = args.sourcedir  # TODO: replace with singleton
        pilot_cache.pilot_source_dir = args.sourcedir

    # set the pilot user (e.g. ATLAS)
    environ["PILOT_USER"] = args.pilot_user  # TODO: replace with singleton

    # internal pilot state
    environ["PILOT_JOB_STATE"] = "startup"  # TODO: replace with singleton
    pilot_cache.pilot_job_state = "startup"

    # set the pilot version
    environ["PILOT_VERSION"] = get_pilot_version()
    pilot_cache.pilot_version = get_pilot_version()

    # set the default wrap-up/finish instruction
    environ["PILOT_WRAP_UP"] = "NORMAL"

    # proxy verifications
    environ["PILOT_PROXY_VERIFICATION"] = f"{args.verify_proxy}"
    environ["PILOT_PAYLOAD_PROXY_VERIFICATION"] = f"{args.verify_payload_proxy}"

    # keep track of the server updates, if any
    environ["SERVER_UPDATE"] = SERVER_UPDATE_NOT_DONE

    # set the (HPC) resource name (if set in options)
    environ["PILOT_RESOURCE_NAME"] = args.hpc_resource

    # allow for the possibility of turning off rucio traces
    environ["PILOT_USE_RUCIO_TRACES"] = f"{args.use_rucio_traces}"

    # event service executor type
    environ["PILOT_ES_EXECUTOR_TYPE"] = args.executor_type

    if args.output_dir:
        environ["PILOT_OUTPUT_DIR"] = args.output_dir

    # keep track of the server urls
    environ["PANDA_SERVER_URL"] = get_panda_server(
        args.url, args.port, update_server=args.update_server
    )
    environ["QUEUEDATA_SERVER_URL"] = f"{args.queuedata_url}"
    if args.storagedata_url:
        environ["STORAGEDATA_SERVER_URL"] = f"{args.storagedata_url}"

    # should cgroups be used for process management?
    pilot_cache.use_cgroups = is_htcondor_version_sufficient() if args.pilot_user.lower() == 'atlas' else False

    # create a cgroup for the pilot
    if pilot_cache.use_cgroups:
        _ = create_cgroup()


def wrap_up() -> int:
    """
    Perform cleanup and terminate logging.

    Note: args and mainworkdir, used in this function, are defined in outer scope.

    Returns:
        int: exit code.
    """
    # cleanup pilot workdir if created
    if args.sourcedir != mainworkdir and args.cleanup:
        chdir(args.sourcedir)
        try:
            rmtree(mainworkdir)
        except OSError as exc:
            logging.warning(f"failed to remove {mainworkdir}: {exc}")
        else:
            logging.info(f"removed {mainworkdir}")

    # in Harvester mode, create a kill_worker file that will instruct Harvester that the pilot has finished
    if args.harvester:
        kill_worker()

    exitcode, shellexitcode = get_proper_exit_code()
    logging.info(f"pilot has finished (exit code={exitcode}, shell exit code={shellexitcode})")
    logging.shutdown()

    return shellexitcode


def get_proper_exit_code() -> (int, int):
    """
    Return the proper exit code.

    Returns:
        Tuple[int, int]: A tuple containing the exit code and the shell exit code.
    """
    try:
        exitcode = trace.pilot["error_code"]
    except (KeyError, AttributeError):
        exitcode = trace
        logging.debug(f"trace was not a class, trace={trace}")
    else:
        logging.info(f"traces error code: {exitcode}")
        if trace.pilot["nr_jobs"] <= 1:
            if exitcode != 0:
                logging.info(
                    f"an exit code was already set: {exitcode} (will be converted to a standard shell code)"
                )
        elif trace.pilot["nr_jobs"] > 0:
            if trace.pilot["nr_jobs"] == 1:
                logging.getLogger(__name__).info(
                    "pilot has finished 1 job was processed)"
                )
            else:
                logging.getLogger(__name__).info(
                    f"pilot has finished ({trace.pilot['nr_jobs']} jobs were processed)"
                )
        elif trace.pilot["state"] == FAILURE:
            logging.critical("pilot workflow failure -- aborting")
        elif trace.pilot["state"] == ERRNO_NOJOBS:
            logging.critical("pilot did not process any events -- aborting")
            exitcode = ERRNO_NOJOBS

    try:
        exitcode = int(exitcode)
    except TypeError as exc:
        logging.warning(f"failed to convert exit code to int: {exitcode}, {exc}")
        exitcode = 1008

    if exitcode == 0 and args.signal:
        exitcode = convert_signal_to_exit_code(args.signal)
    sec = shell_exit_code(exitcode)

    return exitcode, sec


def get_pilot_source_dir() -> str:
    """
    Return the pilot source directory.

    Returns:
        str: Full path to pilot source directory.
    """
    cwd = getcwd()
    if exists(
        join(join(cwd, "pilot3"), "pilot.py")
    ):  # in case wrapper has untarred src as pilot3 in init dir
        cwd = join(cwd, "pilot3")
    return cwd


def send_worker_status(
    status: str,
    queue: str,
    url: str,
    port: int,
    logger: Any,
    internet_protocol_version: str,
):
    """
    Send worker info to the server to let it know that the worker has started.

    Note: the function can fail, but if it does, it will be ignored.

    Args:
        status (str): 'started' or 'finished'.
        queue (str): PanDA queue name.
        url (str): Server URL.
        port (int): Server port.
        logger (Any): Logging object.
        internet_protocol_version (str): Internet protocol version, IPv4 or IPv6.
    """
    # worker node structure to be sent to the server
    data = {}
    try:
        data["worker_id"] = int(os.environ.get("HARVESTER_WORKER_ID", None))
    except (ValueError, TypeError):
        logger.warning("failed to convert worker_id to int, worker_id will not be set in worker status update")
    data["harvester_id"] = os.environ.get("HARVESTER_ID", None)
    data["status"] = status
    data["node_id"] = get_node_name()

    # attempt to send the worker info to the server
    if data["worker_id"] and data["harvester_id"]:
        send_update(
            "api/v1/pilot/update_worker_status", data, url, port, ipv=internet_protocol_version, max_attempts=2
        )
    else:
        logger.warning("workerID/harvesterID not known, will not send worker status to server")


def send_workernode_map(
        site: str,
        queue: str,
        url: str,
        port: int,
        internet_protocol_version: str,
        logger: Any,
):
    """
    Send worker node map and GPU info to the server.

    Args:
        site (str): ATLAS site name.
        queue (str): PanDA queue name.
        url (str): Server URL.
        port (int): Server port.
        internet_protocol_version (str): Internet protocol version, IPv4 or IPv6.
        logger (Any): Logging object.
    """
    # should the worker node map be sent to the server at this point or later when the job report is sent?
    send_now = True if args.update_server else False

    # worker node structure to be sent to the server
    try:
        data = get_workernode_map(site, queue)
    except Exception as e:
        logger.warning(f"exception caught when calling get_workernode_map(): {e}")
    if send_now:
        try:
            send_update("api/v1/pilot/update_worker_node", data, url, port, ipv=internet_protocol_version, max_attempts=1)
        except Exception as e:
            logger.warning(f"exception caught when sending worker node map to server: {e}")

    # GPU info
    try:
        data = get_workernode_gpu_map(site)
    except Exception as e:
        logger.warning(f"exception caught when calling get_workernode_gpu_map(): {e}")
    if send_now and data:  # only send if data is not empty
        try:
            send_update("api/v1/pilot/update_worker_node_gpu", data, url, port, ipv=internet_protocol_version, max_attempts=1)
        except Exception as e:
            logger.warning(f"exception caught when sending worker node map to server: {e}")


def set_lifetime():
    """Update the pilot lifetime if set by an environment variable (PANDAPILOT_LIFETIME) (in seconds)."""
    lifetime = os.environ.get("PANDAPILOT_LIFETIME", None)
    if lifetime:
        try:
            _lifetime = int(lifetime)
        except (ValueError, TypeError):
            pass
        else:
            args.lifetime = _lifetime


def set_redirectall():
    """
    Set args redirectall field.

    Currently not used.
    """
    redirectall = os.environ.get("PANDAPILOT_REDIRECTALL", False)
    if not redirectall:
        try:
            redirectall = bool(redirectall)
        except (ValueError, TypeError):
            pass
        else:
            args.redirectall = redirectall


def list_zombies():
    """
    Make sure there are no remaining defunct processes still lingering.

    Note: can be used to find zombies, but zombies can't be killed..
    """
    found = find_defunct_subprocesses(os.getpid())
    if found:
        logging.info(f"found these defunct processes: {found}")
    else:
        logging.info("no defunct processes were found")


if __name__ == "__main__":
    # get the args from the arg parser
    args = get_args()
    args.last_heartbeat = time.time()  # keep track of server heartbeats
    args.pilot_heartbeat = time.time()  # keep track of pilot heartbeats

    # Define and set the main harvester control boolean
    args.harvester = is_harvester_mode(args)

    # initialize the pilot timing dictionary
    args.timing = {}  # TODO: move to singleton?

    # initialize job status dictionary (e.g. used to keep track of log transfers)
    args.job_status = {}  # TODO: move to singleton or to job object directly?

    # if 'BNL_OSG_SPHENIX_TEST' in args.queue:
    #    args.lifetime = 3600
    #    args.subscribe_to_msgsvc = True
    #    args.redirectstdout = '/dev/null'

    # store T0 time stamp
    add_to_pilot_timing("0", PILOT_START_TIME, time.time(), args)
    add_to_pilot_timing("1", PILOT_MULTIJOB_START_TIME, time.time(), args)

    # if requested by the wrapper via a pilot option, create the main pilot workdir and cd into it
    args.sourcedir = getcwd()  # get_pilot_source_dir()

    exit_code, mainworkdir = create_main_work_dir()
    if exit_code != 0:
        sys.exit(exit_code)

    set_lifetime()

    # setup and establish standard logging
    establish_logging(
        debug=args.debug, nopilotlog=args.nopilotlog, redirectstdout=args.redirectstdout
    )

    # set environment variables (to be replaced with singleton implementation)
    set_environment_variables()

    # execute main function
    trace = main()

    # store final time stamp (cannot be placed later since the mainworkdir is about to be purged)
    add_to_pilot_timing("0", PILOT_END_TIME, time.time(), args, store=False)

    # make sure the pilot does not leave any lingering defunct child processes behind
    if args.debug:
        list_zombies()

    # perform cleanup and terminate logging
    exit_code = wrap_up()

    # the end.
    sys.exit(exit_code)