Triton
Getting Started
Installation
Tutorials
Python API
triton
triton.language
triton.testing
Triton Semantics
Gluon
Overview
Tutorials
Examples
API Reference
Triton MLIR Dialects
Triton MLIR Dialects and Ops
Programming Guide
Introduction
Related Work
Debugging Triton
Floating-Point Sanitizer (FpSan)
Triton
Index
Index
_ | A | B | C | D | E | F | G | H | I | J | L | M | N | P | R | S | T | U | V | W | X | Z
_
__init__() (triton.Config method)
(triton.experimental.gluon.GluonJITFunction method)
(triton.experimental.gluon.language.amd.AMDMFMALayout method)
(triton.experimental.gluon.language.amd.AMDWMMALayout method)
(triton.experimental.gluon.language.amd.warp_pipeline_stage method)
(triton.experimental.gluon.language.AutoLayout method)
(triton.experimental.gluon.language.BlockedLayout method)
(triton.experimental.gluon.language.CoalescedLayout method)
(triton.experimental.gluon.language.distributed_type method)
(triton.experimental.gluon.language.DistributedLinearLayout method)
(triton.experimental.gluon.language.DotOperandLayout method)
(triton.experimental.gluon.language.nvidia.ampere.mbarrier.MBarrierLayout method)
(triton.experimental.gluon.language.nvidia.blackwell.clc.clc_result method)
(triton.experimental.gluon.language.nvidia.blackwell.tensor_memory_descriptor method)
(triton.experimental.gluon.language.nvidia.blackwell.tensor_memory_descriptor_type method)
(triton.experimental.gluon.language.nvidia.blackwell.TensorMemoryLayout method)
(triton.experimental.gluon.language.nvidia.blackwell.TensorMemoryScalesLayout method)
(triton.experimental.gluon.language.nvidia.blackwell.tma.tensor_descriptor method)
(triton.experimental.gluon.language.nvidia.blackwell.tma.tensor_descriptor_type method)
(triton.experimental.gluon.language.nvidia.hopper.mbarrier.MBarrierLayout method)
(triton.experimental.gluon.language.nvidia.hopper.tma.tensor_descriptor method)
(triton.experimental.gluon.language.nvidia.hopper.tma.tensor_descriptor_im2col method)
(triton.experimental.gluon.language.nvidia.hopper.tma.tensor_descriptor_im2col_type method)
(triton.experimental.gluon.language.nvidia.hopper.tma.tensor_descriptor_type method)
(triton.experimental.gluon.language.NVMMADistributedLayout method)
(triton.experimental.gluon.language.NVMMASharedLayout method)
(triton.experimental.gluon.language.PaddedSharedLayout method)
(triton.experimental.gluon.language.shared_memory_descriptor method)
(triton.experimental.gluon.language.SharedLinearLayout method)
(triton.experimental.gluon.language.SliceLayout method)
(triton.experimental.gluon.language.static_range method)
(triton.experimental.gluon.language.SwizzledSharedLayout method)
(triton.experimental.gluon.language.tensor method)
(triton.experimental.gluon.nvidia.blackwell.TensorDescriptor method)
(triton.experimental.gluon.nvidia.hopper.TensorDescriptor method)
(triton.experimental.gluon.nvidia.hopper.TensorDescriptorIm2Col method)
(triton.language.range method)
(triton.language.static_range method)
(triton.language.tensor method)
(triton.language.tensor_descriptor method)
(triton.testing.Benchmark method)
A
abs() (in module triton.experimental.gluon.language)
(in module triton.language)
add() (in module triton.experimental.gluon.language)
advance() (in module triton.language)
allocate_mbarrier() (in module triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
allocate_shared_memory() (in module triton.experimental.gluon.language)
allocate_tensor_memory() (in module triton.experimental.gluon.language.nvidia.blackwell)
AMDMFMALayout (class in triton.experimental.gluon.language.amd)
AMDWMMALayout (class in triton.experimental.gluon.language.amd)
arange() (in module triton.experimental.gluon.language)
(in module triton.language)
argmax() (in module triton.language)
argmin() (in module triton.language)
arrive() (in module triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(in module triton.experimental.gluon.language.nvidia.hopper.cluster)
(in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
assert_close() (in module triton.testing)
associative_scan() (in module triton.experimental.gluon.language)
(in module triton.language)
assume() (in module triton.experimental.gluon.language)
(in module triton.language)
async_atomic_add() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_atomic_and() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_atomic_max() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_atomic_min() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_atomic_or() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_atomic_xor() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_copy_global_to_shared() (in module triton.experimental.gluon.language.nvidia.ampere.async_copy)
(in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_copy_global_to_shared_im2col() (in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_copy_shared_to_global() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_gather() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
async_load() (in module triton.experimental.gluon.language.nvidia.ampere.async_copy)
(in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_load_im2col() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
async_scatter() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
async_store() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
atomic_add() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_and() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_cas() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_max() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_min() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_or() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_xchg() (in module triton.experimental.gluon.language)
(in module triton.language)
atomic_xor() (in module triton.experimental.gluon.language)
(in module triton.language)
AutoLayout (class in triton.experimental.gluon.language)
autotune() (in module triton)
B
bank_conflicts() (in module triton.experimental.gluon.language)
barrier() (in module triton.experimental.gluon.language)
(in module triton.experimental.gluon.language.nvidia.hopper.cluster)
Benchmark (class in triton.testing)
BlockedLayout (class in triton.experimental.gluon.language)
broadcast() (in module triton.experimental.gluon.language)
(in module triton.language)
broadcast_to() (in module triton.language)
buffer_atomic_add() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_and() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_max() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_min() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_or() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_xchg() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_atomic_xor() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_load() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
buffer_store() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
C
cast() (in module triton.experimental.gluon.language)
(in module triton.language)
cat() (in module triton.language)
cdiv() (in module triton.experimental.gluon.language)
(in module triton.language)
ceil() (in module triton.experimental.gluon.language)
(in module triton.language)
clamp() (in module triton.experimental.gluon.language)
(in module triton.language)
clc_result (class in triton.experimental.gluon.language.nvidia.blackwell.clc)
CoalescedLayout (class in triton.experimental.gluon.language)
commit_group() (in module triton.experimental.gluon.language.nvidia.ampere.async_copy)
Config (class in triton)
constexpr_function() (in module triton.experimental.gluon)
convert_layout() (in module triton.experimental.gluon.language)
cos() (in module triton.experimental.gluon.language)
(in module triton.language)
cumprod() (in module triton.language)
cumsum() (in module triton.language)
D
debug_barrier() (in module triton.language)
device_assert() (in module triton.experimental.gluon.language)
(in module triton.language)
device_print() (in module triton.experimental.gluon.language)
(in module triton.language)
distributed_type (class in triton.experimental.gluon.language)
DistributedLinearLayout (class in triton.experimental.gluon.language)
div_rn() (in module triton.experimental.gluon.language)
(in module triton.language)
do_bench() (in module triton.testing)
do_bench_cudagraph() (in module triton.testing)
do_bench_cudagraph_proton() (in module triton.testing)
do_bench_proton() (in module triton.testing)
dot() (in module triton.language)
dot_fma() (in module triton.experimental.gluon.language)
dot_scaled() (in module triton.language)
DotOperandLayout (class in triton.experimental.gluon.language)
E
erf() (in module triton.experimental.gluon.language)
(in module triton.language)
exp() (in module triton.experimental.gluon.language)
(in module triton.language)
exp2() (in module triton.experimental.gluon.language)
(in module triton.language)
expand_dims() (in module triton.experimental.gluon.language)
(in module triton.language)
expect() (in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
F
fdiv() (in module triton.experimental.gluon.language)
(in module triton.language)
fence_async_shared() (in module triton.experimental.gluon.language.nvidia.blackwell)
(in module triton.experimental.gluon.language.nvidia.hopper)
fence_init_release_cluster() (in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
flip() (in module triton.language)
floor() (in module triton.experimental.gluon.language)
(in module triton.language)
fma() (in module triton.experimental.gluon.language)
(in module triton.language)
fp4_to_fp() (in module triton.experimental.gluon.language)
full() (in module triton.experimental.gluon.language)
(in module triton.language)
full_like() (in module triton.experimental.gluon.language)
G
gather() (in module triton.experimental.gluon.language)
(in module triton.language)
gdc_launch_dependents() (in module triton.language.extra.cuda)
gdc_wait() (in module triton.language.extra.cuda)
get_mfma_scale_layout() (in module triton.experimental.gluon.language.amd.cdna4)
GluonJITFunction (class in triton.experimental.gluon)
H
heuristics() (in module triton)
histogram() (in module triton.experimental.gluon.language)
(in module triton.language)
I
init() (in module triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
inline_asm_elementwise() (in module triton.experimental.gluon.language)
(in module triton.language)
interleave() (in module triton.language)
invalidate() (in module triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
J
jit() (in module triton)
(in module triton.experimental.gluon)
join() (in module triton.experimental.gluon.language)
(in module triton.language)
L
load() (in module triton.experimental.gluon.language)
(in module triton.language)
load_result() (in module triton.experimental.gluon.language.nvidia.blackwell.clc)
load_tensor_descriptor() (in module triton.language)
log() (in module triton.experimental.gluon.language)
(in module triton.language)
log2() (in module triton.experimental.gluon.language)
(in module triton.language)
M
make_block_ptr() (in module triton.language)
make_tensor_descriptor() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
(in module triton.language)
map_elementwise() (in module triton.experimental.gluon.language)
max() (in module triton.experimental.gluon.language)
(in module triton.language)
max_constancy() (in module triton.experimental.gluon.language)
(in module triton.language)
max_contiguous() (in module triton.experimental.gluon.language)
(in module triton.language)
maximum() (in module triton.experimental.gluon.language)
(in module triton.language)
mbarrier_arrive() (in module triton.experimental.gluon.language.nvidia.ampere.async_copy)
MBarrierLayout (class in triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(class in triton.experimental.gluon.language.nvidia.hopper.mbarrier)
mfma() (in module triton.experimental.gluon.language.amd.cdna3)
(in module triton.experimental.gluon.language.amd.cdna4)
mfma_scaled() (in module triton.experimental.gluon.language.amd.cdna4)
min() (in module triton.experimental.gluon.language)
(in module triton.language)
minimum() (in module triton.experimental.gluon.language)
(in module triton.language)
mma_v2() (in module triton.experimental.gluon.language.nvidia.ampere)
(in module triton.experimental.gluon.language.nvidia.blackwell)
(in module triton.experimental.gluon.language.nvidia.hopper)
module
triton.experimental.gluon.language.amd.cdna4.async_copy
triton.experimental.gluon.language.nvidia.ampere.async_copy
triton.experimental.gluon.language.nvidia.ampere.mbarrier
triton.experimental.gluon.language.nvidia.blackwell.async_copy
triton.experimental.gluon.language.nvidia.blackwell.clc
triton.experimental.gluon.language.nvidia.blackwell.mbarrier
triton.experimental.gluon.language.nvidia.blackwell.tma
triton.experimental.gluon.language.nvidia.hopper.async_copy
triton.experimental.gluon.language.nvidia.hopper.cluster
triton.experimental.gluon.language.nvidia.hopper.mbarrier
triton.experimental.gluon.language.nvidia.hopper.tma
mul() (in module triton.experimental.gluon.language)
multiple_of() (in module triton.experimental.gluon.language)
(in module triton.language)
must_use_result() (in module triton.experimental.gluon)
N
num_ctas() (in module triton.experimental.gluon.language)
num_programs() (in module triton.experimental.gluon.language)
(in module triton.language)
num_warps() (in module triton.experimental.gluon.language)
NVMMADistributedLayout (class in triton.experimental.gluon.language)
NVMMASharedLayout (class in triton.experimental.gluon.language)
P
PaddedSharedLayout (class in triton.experimental.gluon.language)
perf_report() (in module triton.testing)
permute() (in module triton.experimental.gluon.language)
(in module triton.language)
program_id() (in module triton.experimental.gluon.language)
(in module triton.language)
R
rand() (in module triton.language)
randint() (in module triton.language)
randint4x() (in module triton.language)
randn() (in module triton.language)
range (class in triton.language)
ravel() (in module triton.experimental.gluon.language)
(in module triton.language)
reduce() (in module triton.experimental.gluon.language)
(in module triton.language)
reduce_or() (in module triton.experimental.gluon.language)
reshape() (in module triton.experimental.gluon.language)
(in module triton.language)
rsqrt() (in module triton.experimental.gluon.language)
(in module triton.language)
S
set_auto_layout() (in module triton.experimental.gluon.language)
shared_memory_descriptor (class in triton.experimental.gluon.language)
SharedLinearLayout (class in triton.experimental.gluon.language)
sigmoid() (in module triton.language)
sin() (in module triton.experimental.gluon.language)
(in module triton.language)
SliceLayout (class in triton.experimental.gluon.language)
softmax() (in module triton.language)
sort() (in module triton.language)
split() (in module triton.experimental.gluon.language)
(in module triton.language)
sqrt() (in module triton.experimental.gluon.language)
(in module triton.language)
sqrt_rn() (in module triton.experimental.gluon.language)
(in module triton.language)
static_assert() (in module triton.experimental.gluon.language)
(in module triton.language)
static_print() (in module triton.experimental.gluon.language)
(in module triton.language)
static_range (class in triton.experimental.gluon.language)
(class in triton.language)
store() (in module triton.experimental.gluon.language)
(in module triton.language)
store_tensor_descriptor() (in module triton.language)
store_wait() (in module triton.experimental.gluon.language.nvidia.blackwell.tma)
(in module triton.experimental.gluon.language.nvidia.hopper.tma)
sub() (in module triton.experimental.gluon.language)
sum() (in module triton.experimental.gluon.language)
(in module triton.language)
swizzle2d() (in module triton.language)
SwizzledSharedLayout (class in triton.experimental.gluon.language)
T
tensor (class in triton.experimental.gluon.language)
(class in triton.language)
tensor_descriptor (class in triton.experimental.gluon.language.nvidia.blackwell.tma)
(class in triton.experimental.gluon.language.nvidia.hopper.tma)
(class in triton.language)
tensor_descriptor_im2col (class in triton.experimental.gluon.language.nvidia.hopper.tma)
tensor_descriptor_im2col_type (class in triton.experimental.gluon.language.nvidia.hopper.tma)
tensor_descriptor_type (class in triton.experimental.gluon.language.nvidia.blackwell.tma)
(class in triton.experimental.gluon.language.nvidia.hopper.tma)
tensor_memory_descriptor (class in triton.experimental.gluon.language.nvidia.blackwell)
tensor_memory_descriptor_type (class in triton.experimental.gluon.language.nvidia.blackwell)
TensorDescriptor (class in triton.experimental.gluon.nvidia.blackwell)
(class in triton.experimental.gluon.nvidia.hopper)
TensorDescriptorIm2Col (class in triton.experimental.gluon.nvidia.hopper)
TensorMemoryLayout (class in triton.experimental.gluon.language.nvidia.blackwell)
TensorMemoryScalesLayout (class in triton.experimental.gluon.language.nvidia.blackwell)
to_linear_layout() (in module triton.experimental.gluon.language)
to_tensor() (in module triton.experimental.gluon.language)
topk() (in module triton.language)
trans() (in module triton.language)
triton.experimental.gluon.language.amd.cdna4.async_copy
module
triton.experimental.gluon.language.nvidia.ampere.async_copy
module
triton.experimental.gluon.language.nvidia.ampere.mbarrier
module
triton.experimental.gluon.language.nvidia.blackwell.async_copy
module
triton.experimental.gluon.language.nvidia.blackwell.clc
module
triton.experimental.gluon.language.nvidia.blackwell.mbarrier
module
triton.experimental.gluon.language.nvidia.blackwell.tma
module
triton.experimental.gluon.language.nvidia.hopper.async_copy
module
triton.experimental.gluon.language.nvidia.hopper.cluster
module
triton.experimental.gluon.language.nvidia.hopper.mbarrier
module
triton.experimental.gluon.language.nvidia.hopper.tma
module
try_cancel() (in module triton.experimental.gluon.language.nvidia.blackwell.clc)
U
umulhi() (in module triton.experimental.gluon.language)
(in module triton.language)
V
view() (in module triton.language)
W
wait() (in module triton.experimental.gluon.language.nvidia.ampere.mbarrier)
(in module triton.experimental.gluon.language.nvidia.hopper.cluster)
(in module triton.experimental.gluon.language.nvidia.hopper.mbarrier)
wait_group() (in module triton.experimental.gluon.language.nvidia.ampere.async_copy)
warp_pipeline_stage (class in triton.experimental.gluon.language.amd)
warp_specialize() (in module triton.experimental.gluon.language)
warpgroup_mma() (in module triton.experimental.gluon.language.nvidia.hopper)
warpgroup_mma_wait() (in module triton.experimental.gluon.language.nvidia.hopper)
where() (in module triton.experimental.gluon.language)
(in module triton.language)
wmma() (in module triton.experimental.gluon.language.amd.rdna3)
(in module triton.experimental.gluon.language.amd.rdna4)
X
xor_sum() (in module triton.experimental.gluon.language)
(in module triton.language)
Z
zeros() (in module triton.experimental.gluon.language)
(in module triton.language)
zeros_like() (in module triton.experimental.gluon.language)
(in module triton.language)