diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 086633cf043..c8c6fec9ac9 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -11,12 +11,14 @@ sphinx-copybutton==0.5.2
 sphinx_sitemap==2.7.1
 sphinxcontrib-mermaid==1.0.0
 sphinxcontrib.katex==0.9.10
+sphinx_tippy==0.4.3
 pypandoc==1.15
 pandocfilters==1.5.1
 markdown==3.8.2
 
 # PyTorch Theme
-pytorch_sphinx_theme2==0.2.0
+#pytorch_sphinx_theme2==0.2.0
+git+https://github.com/pytorch/pytorch_sphinx_theme.git@5b6d2df5660d2ccf4b34cf819b7ab7c69f65f20d#egg=pytorch_sphinx_theme2
 
 # Tutorial dependencies
 tqdm==4.66.1
diff --git a/advanced_source/cpp_autograd.rst b/advanced_source/cpp_autograd.rst
index 51e5e0b358f..5bc488da2b1 100644
--- a/advanced_source/cpp_autograd.rst
+++ b/advanced_source/cpp_autograd.rst
@@ -15,7 +15,7 @@ Basic autograd operations
 
 (Adapted from `this tutorial `_)
 
-Create a tensor and set ``torch::requires_grad()`` to track computation with it
+Create a :term:`tensor` and set ``torch::requires_grad()`` to track computation with it
 
 .. code-block:: cpp
 
@@ -64,7 +64,7 @@ Do more operations on ``y``
 
   auto z = y * y * 3;
   auto out = z.mean();
-
+
   std::cout << z << std::endl;
   std::cout << z.grad_fn()->name() << std::endl;
   std::cout << out << std::endl;
@@ -90,10 +90,10 @@ Out:
 
   auto a = torch::randn({2, 2});
   a = ((a * 3) / (a - 1));
   std::cout << a.requires_grad() << std::endl;
-
+
   a.requires_grad_(true);
   std::cout << a.requires_grad() << std::endl;
-
+
   auto b = (a * a).sum();
   std::cout << b.grad_fn()->name() << std::endl;
@@ -106,13 +106,13 @@ Out:
 
   SumBackward0
 
 Let's backprop now. Because ``out`` contains a single scalar, ``out.backward()``
-is equivalent to ``out.backward(torch::tensor(1.))``.
+is equivalent to ``out.backward(torch::tensor(1.))``. This is part of the :term:`backward pass`.
 
 .. code-block:: cpp
 
   out.backward();
 
-Print gradients d(out)/dx
+Print :term:`gradients <gradient>` d(out)/dx
 
 .. code-block:: cpp
 
@@ -134,12 +134,12 @@ Now let's take a look at an example of vector-Jacobian product:
 
 .. code-block:: cpp
 
   x = torch::randn(3, torch::requires_grad());
-
+
   y = x * 2;
   while (y.norm().item<double>() < 1000) {
     y = y * 2;
   }
-
+
   std::cout << y << std::endl;
   std::cout << y.grad_fn()->name() << std::endl;
@@ -159,7 +159,7 @@ If we want the vector-Jacobian product, pass the vector to ``backward`` as argum
 
   auto v = torch::tensor({0.1, 1.0, 0.0001}, torch::kFloat);
   y.backward(v);
-
+
   std::cout << x.grad() << std::endl;
 
 Out:
@@ -178,7 +178,7 @@ either by putting ``torch::NoGradGuard`` in a code block
 
   std::cout << x.requires_grad() << std::endl;
   std::cout << x.pow(2).requires_grad() << std::endl;
-
+
   {
     torch::NoGradGuard no_grad;
     std::cout << x.pow(2).requires_grad() << std::endl;
   }
@@ -218,31 +218,31 @@ please see `the corresponding C++ API docs `
 
-One of the applications of higher-order gradients is calculating gradient penalty.
+One of the applications of higher-order :term:`gradients <gradient>` is calculating :term:`gradient` penalty.
 Let's see an example of it using ``torch::autograd::grad``:
 
 .. code-block:: cpp
 
   #include <torch/torch.h>
-
+
   auto model = torch::nn::Linear(4, 3);
-
+
   auto input = torch::randn({3, 4}).requires_grad_(true);
   auto output = model(input);
-
+
   // Calculate loss
   auto target = torch::randn({3, 3});
   auto loss = torch::nn::MSELoss()(output, target);
-
+
   // Use norm of gradients as penalty
   auto grad_output = torch::ones_like(output);
   auto gradient = torch::autograd::grad({output}, {input}, /*grad_outputs=*/{grad_output}, /*create_graph=*/true)[0];
   auto gradient_penalty = torch::pow((gradient.norm(2, /*dim=*/1) - 1), 2).mean();
-
+
   // Add gradient penalty to loss
   auto combined_loss = loss + gradient_penalty;
   combined_loss.backward();
-
+
   std::cout << input.grad() << std::endl;
 
 Out:
@@ -277,14 +277,14 @@ Below you can find code for a ``Linear`` function from ``torch::nn``:
 
 .. code-block:: cpp
 
   #include <torch/torch.h>
-
+
   using namespace torch::autograd;
-
+
   // Inherit from Function
   class LinearFunction : public Function<LinearFunction> {
    public:
    // Note that both forward and backward are static functions
-
+
    // bias is an optional argument
    static torch::Tensor forward(
        AutogradContext *ctx, torch::Tensor input, torch::Tensor weight, torch::Tensor bias = torch::Tensor()) {
@@ -295,13 +295,13 @@ Below you can find code for a ``Linear`` function from ``torch::nn``:
      }
      return output;
    }
-
+
    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
      auto saved = ctx->get_saved_variables();
      auto input = saved[0];
      auto weight = saved[1];
      auto bias = saved[2];
-
+
      auto grad_output = grad_outputs[0];
      auto grad_input = grad_output.mm(weight);
      auto grad_weight = grad_output.t().mm(input);
@@ -309,7 +309,7 @@ Below you can find code for a ``Linear`` function from ``torch::nn``:
      if (bias.defined()) {
        grad_bias = grad_output.sum(0);
      }
-
+
      return {grad_input, grad_weight, grad_bias};
    }
  };
@@ -322,7 +322,7 @@ Then, we can use the ``LinearFunction`` in the following way:
 
   auto weight = torch::randn({4, 3}).requires_grad_();
   auto y = LinearFunction::apply(x, weight);
   y.sum().backward();
-
+
   std::cout << x.grad() << std::endl;
   std::cout << weight.grad() << std::endl;
@@ -344,9 +344,9 @@ Here, we give an additional example of a function that is parametrized by non-te
 
 .. code-block:: cpp
 
   #include <torch/torch.h>
-
+
   using namespace torch::autograd;
-
+
   class MulConstant : public Function<MulConstant> {
    public:
    static torch::Tensor forward(AutogradContext *ctx, torch::Tensor tensor, double constant) {
@@ -355,7 +355,7 @@ Here, we give an additional example of a function that is parametrized by non-te
      ctx->saved_data["constant"] = constant;
      return tensor * constant;
    }
-
+
    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
      // We return as many input gradients as there were arguments.
      // Gradients of non-tensor arguments to forward must be `torch::Tensor()`.
diff --git a/advanced_source/cpp_export.rst b/advanced_source/cpp_export.rst
index 56c4bcbaae7..25ac0713bfd 100644
--- a/advanced_source/cpp_export.rst
+++ b/advanced_source/cpp_export.rst
@@ -1,3 +1,3 @@
 .. warning::
-   TorchScript is deprecated, please use
-   `torch.export `__ instead.
\ No newline at end of file
+   :term:`TorchScript` is deprecated, please use
+   `torch.export `__ instead.
diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst
index 968afa01b23..56b8d587f8a 100644
--- a/advanced_source/cpp_frontend.rst
+++ b/advanced_source/cpp_frontend.rst
@@ -36,10 +36,12 @@ This tutorial will walk you through an end-to-end example of training a model
 with the C++ frontend. Concretely, we will be training a `DCGAN `_ -- a kind of
 generative model -- to generate images of MNIST digits. While conceptually a simple
 example, it should
-be enough to give you a whirlwind overview of the PyTorch C++ frontend and wet
-your appetite for training more complex models. We will begin with some
-motivating words for why you would want to use the C++ frontend to begin with,
-and then dive straight into defining and training our model.
+be enough to give you a whirlwind overview of the PyTorch C++ frontend and whet
+your appetite for training more complex models.
+
+We'll begin with some motivating words for why you would want to use the C++ frontend to begin with,
+and then dive straight into defining and training our model. In this tutorial, we'll train
+a model on :term:`GPU` for optimal performance.
 
 .. tip::
 
@@ -960,8 +962,8 @@ Writing the Training Loop
 
 Let's now finish the algorithmic part of our example and implement the delicate
 dance between the generator and discriminator. First, we'll create two
-optimizers, one for the generator and one for the discriminator. The optimizers
-we use implement the `Adam `_ algorithm:
+optimizers, one for the generator and one for the discriminator.
+The :term:`optimizers <optimizer>` we use implement the `Adam `_ algorithm:
 
 .. code-block:: cpp
 
diff --git a/advanced_source/custom_class_pt2.rst b/advanced_source/custom_class_pt2.rst
index 229a94f2ce9..f3c43016ddd 100644
--- a/advanced_source/custom_class_pt2.rst
+++ b/advanced_source/custom_class_pt2.rst
@@ -247,7 +247,7 @@ After re-compilation, we can export the custom op with:
 Why do we need to make a Fake Class?
 ------------------------------------
 
-Tracing with real custom object has several major downsides:
+:term:`Tracing` with a real custom object has several major downsides:
 
 1. Operators on real objects can be time consuming e.g. the custom object
    might be reading from the network or loading data from the disk.
diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst
index 4b03803c15b..f824fe3d004 100644
--- a/advanced_source/dispatcher.rst
+++ b/advanced_source/dispatcher.rst
@@ -18,7 +18,7 @@ of another. Here is a sampling of some of the things it handles:
   depending on whether or not autograd handling is necessary.
 * Applying autocasting when necessary for automatic mixed precision.
 * Applying batching rules when an operator is run under a ``vmap`` call.
-* Tracing execution of operations, if you are tracing a model for export.
+* :term:`Tracing` execution of operations, if you are tracing a model for export.
 
 If in your `custom operator code `_ you find yourself manually writing
 if statements to handle these cases, the dispatcher APIs can
@@ -403,8 +403,8 @@ a kernel at the Batched dispatch key.
 
 Tracer
 ^^^^^^
 
-The Tracer dispatch key implements support for recording invocations of operators
-into a trace when you run ``torch.jit.trace``. We intend to provide a
+The Tracer dispatch key implements support for recording invocations of :term:`operations <Operation>`
+into a trace when you run ``torch.jit.trace`` (:term:`Tracing`). We intend to provide a
 boxed fallback that will implement tracing for arbitrary operations, see
 `issue #41478 `_ to track progress.
diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst
index 12f15355f5f..2422a5ef337 100644
--- a/advanced_source/extend_dispatcher.rst
+++ b/advanced_source/extend_dispatcher.rst
@@ -3,10 +3,13 @@ Extending dispatcher for a new backend in C++
 
 In this tutorial we will walk through all necessary steps to extend the dispatcher to
 add a new device living outside ``pytorch/pytorch`` repo and maintain it to keep in
 sync with native PyTorch devices. Here we'll assume that you're familiar with how
 to `register a dispatched operator in C++ `_ and how to write a
 `custom autograd function `_.
 
+Note: This tutorial covers extending the dispatcher for custom backends that
+implement :term:`device kernels <Device Kernel>` for :term:`operations <Operation>`.
+
 .. note::
 
@@ -295,7 +298,7 @@ JIT support
 As we mentioned in `Registering a Dispatched Operator in C++ `_, kernels registered through `m.impl()` API
 support being called in both unboxed and boxed ways. In other words your customized backend can also work with our
-JIT tracing/scripting frontend just like the in-tree backends like CPU or CUDA do. You could potentially also write specialized optimization
+:term:`JIT` :term:`tracing`/:term:`scripting` frontend just like the in-tree backends like CPU or CUDA do. You could potentially also write specialized optimization
 passes for your backend on a JIT graph. But we will not discuss it here since we haven't finalized the integration point
 in JIT, so the current backend support will focus on the eager frontend for now.
 
@@ -377,4 +380,3 @@ any feature requests or bug reports, please `file an issue on github `
-   `torch.export `__ instead.
\ No newline at end of file
+   :term:`TorchScript` is deprecated, please use
+   `torch.export `__ instead.
diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst
index 01bc497d38e..dceeb7d78bb 100644
--- a/advanced_source/torch_script_custom_ops.rst
+++ b/advanced_source/torch_script_custom_ops.rst
@@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release.
 https://github.com/pytorch/tutorials/issues/3456
 
 .. warning::
-   TorchScript is deprecated, please use
-   `torch.export `__ instead.
\ No newline at end of file
+   :term:`TorchScript` is deprecated, please use
+   `torch.export `__ instead.
diff --git a/compilers_index.rst b/compilers_index.rst
index ec426cecc80..4820a1627a8 100644
--- a/compilers_index.rst
+++ b/compilers_index.rst
@@ -10,7 +10,7 @@ control, as well as third-party backend solutions.
 
 .. warning::
 
-   TorchScript is no longer in active development.
+   :term:`TorchScript` is no longer in active development.
 
 .. raw:: html
 
diff --git a/conf.py b/conf.py
index 67227c0784b..cb730ca8a5c 100644
--- a/conf.py
+++ b/conf.py
@@ -141,8 +141,20 @@ def wrapper(*args, **kwargs):
     "sphinx_sitemap",
     "sphinx_reredirects",
     "sphinxcontrib.mermaid",
+    "sphinx_tippy",
 ]
 
+# sphinx-tippy configuration
+tippy_props = {
+    "placement": "auto-start",
+    "maxWidth": 500,
+    "interactive": True,  # Allow clicking links inside tooltips
+    "theme": "material",
+}
+
+# Skip all URLs except glossary term links (glossary.html#term-*)
+tippy_skip_urls = (r"^(?!.*glossary\.html#term-).*$",)
+
 intersphinx_mapping = {
     "torch": ("https://docs.pytorch.org/docs/stable/", None),
     "tensordict": ("https://docs.pytorch.org/tensordict/stable", None),
diff --git a/glossary.rst b/glossary.rst
new file mode 100644
index 00000000000..7c0bfb823cb
--- /dev/null
+++ b/glossary.rst
@@ -0,0 +1,172 @@
+.. _glossary:
+
+================
+PyTorch Glossary
+================
+
+This glossary provides definitions for terms commonly used in PyTorch documentation.
+
+.. glossary::
+   :sorted:
+
+   ATen
+      Short for "A Tensor Library". The foundational tensor and mathematical
+      operation library on which all else is built.
+
+   attention mechanism
+      A technique used in deep learning models, particularly transformer architectures,
+      to selectively focus on certain input elements or tokens when computing output
+      representations, improving performance and interpretability.
+
+   backward pass
+      The backward pass is part of the backpropagation algorithm where the error
+      gradients are computed and propagated backwards through the network, adjusting
+      the weights and biases to minimize the loss.
+
+   backpropagation
+      An essential algorithm in training neural networks. It calculates the gradient
+      of the loss function with respect to the model's parameters, allowing the
+      network to learn from its mistakes and improve over time.
+
+   CNN
+      Convolutional Neural Network: A type of neural network designed for image and
+      video processing, using convolutional and pooling layers to extract features.
+
+   Compound Kernel
+      In contrast to a :term:`Device Kernel`, a compound kernel is usually
+      device-agnostic and belongs to a :term:`Compound Operation`.
+
+   Compound Operation
+      A Compound Operation is composed of other operations. Its kernel is usually
+      device-agnostic. Normally it doesn't have its own derivative functions defined.
+      Instead, autograd automatically computes its derivative based on the operations it
+      uses.
+
+   Composite Operation
+      Same as :term:`Compound Operation`.
+
+   Convolutional Neural Network
+      A type of neural network designed for image and video processing, using
+      convolutional and pooling layers to extract features. Also known as CNN.
+
+   CUDA
+      Compute Unified Device Architecture: A parallel computing platform developed
+      by NVIDIA that allows developers to use GPUs for general-purpose computing,
+      including machine learning and deep learning applications.
+
+   Custom Operation
+      An Operation that is defined by users and is usually a :term:`Compound Operation`.
+      For example, this `tutorial `_
+      details how to create Custom Operations.
+
+   Device Kernel
+      A device-specific kernel of a :term:`Leaf Operation`.
+
+   embedding
+      A way to represent categorical variables as dense vectors, often used in
+      natural language processing and recommender systems.
+
+   epoch
+      An epoch is a unit of measurement in machine learning that represents one
+      complete pass through the entire training dataset. During each epoch, the
+      model's weights are updated based on the loss calculated from the predictions
+      made on the training data.
+
+   forward pass
+      The forward pass is the process of passing input data through a neural network
+      to obtain an output prediction. It's the first step in training a model,
+      followed by the backward pass and optimization.
+
+   GPU
+      Graphics Processing Unit: A specialized electronic circuit designed to quickly
+      manipulate and alter memory to accelerate computations. In the context of AI
+      and machine learning, GPUs are used to accelerate computationally intensive
+      tasks like training neural networks.
+
+   gradient
+      In machine learning, the gradient represents the rate of change of the loss
+      function with respect to the model's parameters. It's used in backpropagation
+      to update the weights and biases during training.
+
+   Inductor
+      A PyTorch component that enables just-in-time (JIT) compilation of PyTorch
+      models, allowing for faster inference times and better performance on CPUs
+      and GPUs. It is the default backend for :term:`torch.compile`.
+
+   inference
+      The process of making predictions or drawing conclusions from a trained AI
+      model, typically involving the application of the learned relationships to
+      new, unseen data.
+
+   JIT
+      Just-In-Time Compilation: A compilation technique where code is compiled into
+      machine code at runtime, just before it is executed.
+
+   Kernel
+      Implementation of a PyTorch operation, specifying what should be done when an
+      operation executes.
+
+   Leaf Operation
+      An operation that's considered a basic operation, as opposed to a :term:`Compound
+      Operation`. A Leaf Operation always has dispatch functions defined and usually
+      has a derivative function defined as well.
+
+   loss function
+      A loss function, also known as a cost function, is a mathematical function
+      used to evaluate the performance of a machine learning model during training,
+      providing a measure of how well the model is doing.
+
+   LSTM
+      Long Short-Term Memory Network: A type of recurrent neural network (RNN)
+      designed to handle sequential data with long-term dependencies. LSTMs use
+      memory cells and gates to selectively retain information over time.
+
+   Native Operation
+      An operation that comes natively with PyTorch :term:`ATen`, for example ``aten::matmul``.
+
+   Non-Leaf Operation
+      Same as :term:`Compound Operation`.
+
+   Operation
+      A unit of work. For example, the work of matrix multiplication is an operation
+      called ``aten::matmul``.
+
+   optimizer
+      An algorithm used to update the weights and biases of a neural network during
+      training to minimize the loss function. Common optimizers include SGD, Adam,
+      and RMSprop.
+
+   quantization
+      A technique used to reduce the precision of numerical values in a deep learning
+      model, often to reduce memory usage, improve performance, and enable deployment
+      on resource-constrained devices.
+
+   RNN
+      Recurrent Neural Network: A type of neural network designed for sequential data,
+      using recurrent connections to capture temporal dependencies.
+
+   Scripting
+      Using ``torch.jit.script`` on a function to inspect its source code and compile it as
+      :term:`TorchScript` code.
+
+   tensor
+      Tensors are a specialized data structure that are very similar to arrays and
+      matrices. In PyTorch, tensors are used to encode the inputs and outputs of a
+      model, as well as the model's parameters.
+
+   torch.compile
+      A PyTorch function that compiles PyTorch code into an optimized form, allowing
+      for faster execution and better performance. It is the main entry point for
+      PyTorch 2.x optimizations.
+
+   TorchScript
+      Deprecated. Use :term:`torch.compile` instead.
+
+   Tracing
+      Using ``torch.jit.trace`` on a function to record the operations it executes and
+      produce an executable that can be optimized using just-in-time compilation.
+
+   transformer
+      A type of neural network architecture introduced in the paper "Attention is All
+      You Need" (Vaswani et al., 2017), which relies entirely on self-attention
+      mechanisms to process sequential data, such as text or images.
diff --git a/index.rst b/index.rst
index 5a5e80abfbb..d0dc5507206 100644
--- a/index.rst
+++ b/index.rst
@@ -862,3 +862,9 @@ Additional Resources
    :hidden:
 
    prototype/prototype_index
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   glossary
diff --git a/recipes_source/compiling_optimizer.rst b/recipes_source/compiling_optimizer.rst
index 951495ca4fa..2352116a983 100644
--- a/recipes_source/compiling_optimizer.rst
+++ b/recipes_source/compiling_optimizer.rst
@@ -1,12 +1,12 @@
-(beta) Compiling the optimizer with torch.compile
+(beta) Compiling the :term:`optimizer` with :term:`torch.compile`
 ==========================================================================================
 
 **Author:** `Michael Lazos `_
 
-The optimizer is a key algorithm for training any deep learning model.
+The :term:`optimizer` is a key algorithm for training any deep learning model.
 Since it is responsible for updating every model parameter, it can often
-become the bottleneck in training performance for large models. In this recipe,
-we will apply ``torch.compile`` to the optimizer to observe the GPU performance
+become the bottleneck in training performance for large models. In this recipe,
+we will apply ``torch.compile`` to the optimizer to observe the :term:`GPU` performance
 improvement.
 
 .. note::
@@ -24,7 +24,7 @@ Depending on what machine you are using, your exact results may vary.
 
 .. code-block:: python
 
     import torch
-
+
     model = torch.nn.Sequential(
         *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
     )
@@ -39,7 +39,7 @@ and create a helper function to wrap the step()
 in ``torch.compile()``.
 
 .. note::
-
+
    ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0
 
 .. code-block:: python
@@ -57,12 +57,12 @@ in ``torch.compile()``.
     @torch.compile(fullgraph=False)
     def fn():
        opt.step()
-
-
+
+
     # Let's define a helpful benchmarking function:
     import torch.utils.benchmark as benchmark
-
-
+
+
     def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
         t0 = benchmark.Timer(
             stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
         )
@@ -73,12 +73,12 @@ in ``torch.compile()``.
     # Warmup runs to compile the function
     for _ in range(5):
         fn()
-
+
     eager_runtime = benchmark_torch_function_in_microseconds(opt.step)
     compiled_runtime = benchmark_torch_function_in_microseconds(fn)
-
+
     assert eager_runtime > compiled_runtime
-
+
     print(f"eager runtime: {eager_runtime}us")
     print(f"compiled runtime: {compiled_runtime}us")
diff --git a/recipes_source/distributed_optim_torchscript.rst b/recipes_source/distributed_optim_torchscript.rst
index 01bc497d38e..dceeb7d78bb 100644
--- a/recipes_source/distributed_optim_torchscript.rst
+++ b/recipes_source/distributed_optim_torchscript.rst
@@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release.
 https://github.com/pytorch/tutorials/issues/3456
 
 .. warning::
-   TorchScript is deprecated, please use
-   `torch.export `__ instead.
\ No newline at end of file
+   :term:`TorchScript` is deprecated, please use
+   `torch.export `__ instead.
diff --git a/recipes_source/torchscript_inference.rst b/recipes_source/torchscript_inference.rst
index 01bc497d38e..dceeb7d78bb 100644
--- a/recipes_source/torchscript_inference.rst
+++ b/recipes_source/torchscript_inference.rst
@@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release.
 https://github.com/pytorch/tutorials/issues/3456
 
 .. warning::
-   TorchScript is deprecated, please use
-   `torch.export `__ instead.
\ No newline at end of file
+   :term:`TorchScript` is deprecated, please use
+   `torch.export `__ instead.
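
A note on the ``tippy_skip_urls`` entry added to ``conf.py`` above: the regular expression matches, and therefore skips, every URL that does *not* contain ``glossary.html#term-``, so only glossary term anchors receive tooltips. The sketch below is a standalone illustration of that behavior, not part of the patch; the example URLs are made up for demonstration.

.. code-block:: python

   import re

   # Same pattern as the tippy_skip_urls entry added to conf.py.
   TIPPY_SKIP_PATTERN = r"^(?!.*glossary\.html#term-).*$"

   # Hypothetical link targets, for illustration only.
   urls = [
       "https://docs.pytorch.org/docs/stable/export.html",  # external page -> skipped
       "recipes/compiling_optimizer.html",                   # ordinary tutorial page -> skipped
       "glossary.html#term-tensor",                          # glossary term -> tooltip shown
       "../glossary.html#term-torch.compile",                # glossary term -> tooltip shown
   ]

   for url in urls:
       skipped = re.match(TIPPY_SKIP_PATTERN, url) is not None
       print(f"{url}: {'skipped' if skipped else 'tooltip shown'}")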