-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmain.py
More file actions
153 lines (131 loc) · 5.76 KB
/
main.py
File metadata and controls
153 lines (131 loc) · 5.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Copyright (C) 2023 Samsung Electronics Co. LTD
This software is a property of Samsung Electronics.
No part of this software, either material or conceptual may be copied or distributed, transmitted,
transcribed, stored in a retrieval system or translated into any human or computer language in any form by any means,
electronic, mechanical, manual or otherwise, or disclosed
to third parties without the express written permission of Samsung Electronics.
"""
"""
Reference : ocp/ocpmodels/main.py
The following items are modified and they can be claimed as properties of Samsung Electronics.
(1) Support more MLFF models (BPNN, NequIP, Allegro, and MACE)
(2) Support simulation indicators for the benchmark evaluation on simulations (RDF, ADF, EoS, PEW)
(3) Support more loss functions and metrics (loss.py and metric_evaluator.py in src/modules/)
(4) Support more learning rate schedulers (scheduler.py in src/modules/)
(5) Support normalization of per-atom energy (NormalizerPerAtom in src/modules/normalizer.py)
(6) Some other differing features are as follows:
(a) Print training results using PrettyTable
(b) Use a benchmark logger (named bm_logging) instead of the root logger (named logging in OCP)
(c) Remove the features that save prediction results and create the corresponding directory named 'results'
(d) Remove features related to HPO
(e) Set the identifier of an experiment using the starting time
"""
"""
Copyright (c) Facebook, Inc. and its affiliates.
This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.
"""
import sys
import os
sys.path.insert(0, os.path.abspath("./"))
sys.path.insert(1, os.path.abspath("./codebases/ocp"))
sys.path.insert(2, os.path.abspath("./codebases/nequip"))
sys.path.insert(3, os.path.abspath("./codebases/allegro"))
sys.path.insert(4, os.path.abspath("./codebases/mace"))
import copy
import logging
from pathlib import Path
import submitit
from ocpmodels.common.utils import (
setup_logging,
build_config,
create_grid,
save_experiment_log,
)
# benchmark codes
from src.common.flags import benchmark_flags
from src.common.config import (
add_benchmark_config,
add_benchmark_fit_scale_config,
add_benchmark_validate_config,
build_run_md_config,
build_evaluate_config
)
from src.common.utils import new_trainer_context, new_evaluator_context
class Runner(submitit.helpers.Checkpointable):
    """Callable job entry point that submitit can checkpoint and requeue.

    Calling an instance with a config dict runs the task selected by
    ``config["mode"]``: trainer-based modes go through
    ``new_trainer_context`` and evaluator-based modes go through
    ``new_evaluator_context``.

    NOTE: ``__call__`` reads the module-level ``args`` namespace that the
    ``__main__`` section of this file creates; it is not passed in.
    """

    # Modes driven by a trainer object / by an evaluator object.
    _TRAINER_MODES = ("train", "validate", "fit-scale")
    _EVALUATOR_MODES = ("run-md", "evaluate")

    def __init__(self):
        # Everything ``checkpoint`` may inspect is defined up front so that a
        # preemption arriving before (or without) trainer setup cannot raise
        # AttributeError.
        self.config = None
        self.task = None
        self.trainer = None
        self.evaluator = None

    def __call__(self, config):
        """Set up and run the task for ``config["mode"]``.

        Args:
            config: Configuration dict; must contain the key ``"mode"``.

        Raises:
            ValueError: If ``config["mode"]`` is not one of the supported
                modes (previously such a call silently did nothing).
        """
        # Remember the incoming config so a requeue is possible even if the
        # job is interrupted before the context below has been entered.
        self.config = config
        mode = config["mode"]

        if mode in self._TRAINER_MODES:
            with new_trainer_context(args=args, config=config) as ctx:
                self.config = ctx.config
                self.trainer = ctx.trainer
                self.task = ctx.task
                self.task.setup(self.trainer)
                self.task.run()
        elif mode in self._EVALUATOR_MODES:
            with new_evaluator_context(args=args, config=config) as ctx:
                self.config = ctx.config
                self.evaluator = ctx.evaluator
                self.task = ctx.task
                self.task.setup(self.evaluator)
                self.task.run()
        else:
            supported = ", ".join(self._TRAINER_MODES + self._EVALUATOR_MODES)
            raise ValueError(
                f"Unsupported mode {mode!r}; expected one of: {supported}"
            )

    def checkpoint(self, *args, **kwargs):
        """Return the submission that submitit should requeue.

        When a trainer exists, its training state is saved to
        ``checkpoint.pt`` and the config is updated with the task's
        checkpoint path and the trainer's timestamp id before resubmission.
        When no trainer exists (evaluator modes, or interruption before
        setup), the job is resubmitted with the current config unchanged
        instead of failing with AttributeError.

        Returns:
            A ``submitit.helpers.DelayedSubmission`` wrapping a fresh
            ``Runner`` and the config to run it with.
        """
        new_runner = Runner()
        trainer = self.trainer
        if trainer is not None:
            trainer.save(checkpoint_file="checkpoint.pt", training_state=True)
            self.config["checkpoint"] = self.task.chkpt_path
            self.config["timestamp_id"] = trainer.timestamp_id
            if trainer.logger is not None:
                trainer.logger.mark_preempting()
        return submitit.helpers.DelayedSubmission(new_runner, self.config)
if __name__ == "__main__":
    # Script entry point: parse the benchmark flags, build the config that
    # matches the requested mode, then either submit the work through
    # submitit or run it directly in this process.
    setup_logging()

    # NOTE: ``args`` has to stay a module-level name because
    # Runner.__call__ reads it as a global.
    parser = benchmark_flags.get_parser()
    args, override_args = parser.parse_known_args()

    if args.mode == "evaluate":
        config = build_evaluate_config(args)
    elif args.mode == "run-md":
        config = build_run_md_config(args)
    elif args.mode in ("train", "validate", "fit-scale"):
        # Start from the base config and layer the benchmark-specific
        # sections on top, in this fixed order.
        config = build_config(args, override_args)
        for extend_config in (
            add_benchmark_config,
            add_benchmark_validate_config,
            add_benchmark_fit_scale_config,
        ):
            config = extend_config(config, args)

    if not args.submit:
        # Run locally or on a cluster (using job schedulers on Samsung
        # Supercom instead of using args.submit).
        Runner()(config)
    else:
        # Run on a cluster using the built-in job submission.
        # Note that we did not / do not use this way for job submission.
        extra_slurm_params = config.get("slurm")  # additional slurm arguments

        # A sweep file expands the single config into a grid of configs.
        configs = (
            create_grid(config, args.sweep_yml) if args.sweep_yml else [config]
        )
        logging.info(f"Submitting {len(configs)} jobs")

        executor = submitit.AutoExecutor(
            folder=args.logdir / "%j", slurm_max_num_timeout=3
        )
        submission_params = {
            "name": args.identifier,
            "mem_gb": args.slurm_mem,
            # presumably args.slurm_timeout is in hours -- TODO confirm
            "timeout_min": args.slurm_timeout * 60,
            "slurm_partition": args.slurm_partition,
            "gpus_per_node": args.num_gpus,
            "cpus_per_task": config["optim"]["num_workers"] + 1,
            "tasks_per_node": args.num_gpus if args.distributed else 1,
            "nodes": args.num_nodes,
            "slurm_additional_parameters": extra_slurm_params,
        }
        executor.update_parameters(**submission_params)

        # Record the effective executor settings inside every job config.
        for job_config in configs:
            job_config["slurm"] = copy.deepcopy(executor.parameters)
            job_config["slurm"]["folder"] = str(executor.folder)

        jobs = executor.map_array(Runner(), configs)
        logging.info(
            f"Submitted jobs: {', '.join([job.job_id for job in jobs])}"
        )
        experiment_log = save_experiment_log(args, jobs, configs)
        logging.info(f"Experiment log saved to: {experiment_log}")