Fast and memory-efficient exact attention
1051 matches across 14 categories. Click a row to expand file-level details.
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | tools/sass_diff.py | 24 | # ── Parsing ────────────────────────────────────────────────────────────────── |
| MEDIUM | tools/sass_diff.py | 92 | # ── Diffing ────────────────────────────────────────────────────────────────── |
| MEDIUM | tools/sass_diff.py | 111 | # ── Display ────────────────────────────────────────────────────────────────── |
| MEDIUM | tools/sass_diff.py | 219 | # ── Main ───────────────────────────────────────────────────────────────────── |
| MEDIUM | tools/ci/run_fa4_ci.py | 27 | # ── GPU helpers ─────────────────────────────────────────────────────────────── |
| MEDIUM | tools/ci/run_fa4_ci.py | 60 | # ── Step plan ───────────────────────────────────────────────────────────────── |
| MEDIUM | tools/ci/run_fa4_ci.py | 103 | # ── Step runner ─────────────────────────────────────────────────────────────── |
| MEDIUM | tools/ci/run_fa4_ci.py | 129 | # ── CLI ─────────────────────────────────────────────────────────────────────── |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 494 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 496 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 56 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 58 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 95 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 97 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 244 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 246 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 421 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 423 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 619 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 623 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 755 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod_varlen.py | 757 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 70 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 72 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 178 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 180 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 401 | # ============================================================================= |
| MEDIUM | tests/cute/test_score_mod_varlen.py | 403 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 7 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 10 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 485 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 487 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 197 | # ============================================================================= |
| MEDIUM | tests/cute/score_mod_definitions.py | 201 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod.py | 838 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod.py | 842 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod.py | 1245 | # ============================================================================= |
| MEDIUM | tests/cute/test_mask_mod.py | 1247 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 14 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 17 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 19 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 21 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 176 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 180 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 277 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 279 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 340 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 344 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 395 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 397 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 499 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 501 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 603 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 605 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 755 | # ============================================================================= |
| MEDIUM | tests/cute/mask_mod_definitions.py | 757 | # ============================================================================= |
| MEDIUM | tests/cute/test_flash_attn.py | 2725 | # --------------------------------------------------------------------------- |
| MEDIUM | tests/cute/test_flash_attn.py | 2727 | # --------------------------------------------------------------------------- |
| MEDIUM | tests/cute/test_flash_attn.py | 2787 | # --------------------------------------------------------------------------- |
| MEDIUM | tests/cute/test_flash_attn.py | 2789 | # --------------------------------------------------------------------------- |
| 92 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | setup.py | 92 | def get_cuda_bare_metal_version(cuda_dir): |
| LOW | setup.py | 201 | def validate_and_update_archs(archs): |
| LOW | csrc/layer_norm/setup.py | 16 | def get_cuda_bare_metal_version(cuda_dir): |
| LOW | csrc/layer_norm/setup.py | 25 | def check_cuda_torch_binary_vs_bare_metal(cuda_dir): |
| LOW | csrc/fused_dense_lib/setup.py | 10 | def get_cuda_bare_metal_version(cuda_dir): |
| LOW | hopper/test_attn_kvcache.py | 155 | def test_flash_attn_kvcache_nosplit(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, g |
| LOW | hopper/test_attn_kvcache.py | 292 | def test_flash_attn_kvcache_output(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, us |
| LOW | hopper/test_torch_compile_and_export.py | 61 | def test_compile_and_package_model(): |
| LOW | hopper/test_flash_attn_triton_amd.py | 334 | def test_flash_attn_varlen_output( |
| LOW | hopper/test_flash_attn_triton_amd.py | 1042 | def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, causal, dtype): |
| LOW | hopper/test_util.py | 9 | def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random", zero_lengths=False): |
| LOW | hopper/setup.py | 325 | def get_cuda_bare_metal_version(cuda_dir): |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 391 | def test_flash_attn_varlen_output( |
| LOW | hopper/test_flash_attn.py | 404 | def test_flash_attn_varlen_output( |
| LOW | hopper/test_flash_attn.py | 1133 | def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, causal, dtype): |
| LOW | hopper/flash_attn_interface.py | 313 | def _flash_attn_backward_fake( |
| LOW | hopper/flash_attn_interface.py | 747 | def flash_attn_qkvpacked_func( |
| LOW | training/src/metrics/num_tokens.py | 39 | def _forward_reduce_state_update(self, *args: Any, **kwargs: Any) -> Any: |
| LOW | training/src/callbacks/speed_monitor.py | 35 | def on_validation_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: |
| LOW | training/src/optim/param_grouping.py | 15 | def group_parameters_for_optimizer(model, optimizer_cfg, bias_weight_decay=False, |
| LOW | training/src/utils/gpu_affinity.py | 57 | def set_single_unique_affinity(gpu_id, nproc_per_node): |
| LOW | training/src/utils/gpu_affinity.py | 80 | def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): |
| LOW | training/src/utils/checkpoint.py | 32 | def blockdiag_to_dense_mlp_bert(state_dict): |
| LOW | training/src/utils/checkpoint.py | 41 | def interpolate_pos_embedding(state_dict, out_seqlen, pos_embedding_name='model.pos_encoder.pe', interleave=False): |
| LOW | training/src/utils/ddp_zero1.py | 24 | def get_zero_optimizer_state_dict_local(optimizer, global_rank): |
| LOW | tests/test_flash_attn_triton_amd.py | 44 | def attn_bias_from_alibi_slopes( |
| LOW | tests/test_flash_attn_triton_amd.py | 73 | def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): |
| LOW | tests/test_flash_attn_triton_amd.py | 397 | def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): |
| LOW | tests/test_flash_attn_triton_amd.py | 601 | def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 748 | def test_flash_attn_varlen_qkvpacked( |
| LOW | tests/test_flash_attn_triton_amd.py | 1191 | def test_flash_attn_varlen_output( |
| LOW | tests/test_flash_attn_triton_amd.py | 1619 | def test_flash_attn_varlen_causal( |
| LOW | tests/test_flash_attn_triton_amd.py | 2230 | def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, dropout_p, causal, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 2279 | def test_flash_attn_bwd_overflow(seqlen, d, causal, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 2336 | def test_flash_attn_bwd_transpose(seqlen, d, causal, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 2389 | def test_flash_attn_bwd_varlen_overflow(d, causal, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 2448 | def test_flash_attn_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): |
| LOW | tests/test_flash_attn_triton_amd.py | 2507 | def test_flash_attn_varlen_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): |
| LOW | tests/test_rotary.py | 229 | def test_rotary_emb_varlen_func(inplace, interleaved, rotary_fraction, seqlen_offsets_type, dtype): |
| LOW | tests/test_flash_attn_ck.py | 72 | def get_bwd_unsupported_reason(d, deterministic): |
| LOW | tests/test_flash_attn_ck.py | 85 | def ck_randval_to_dropout_mask(randval, p): |
| LOW | tests/test_flash_attn_ck.py | 92 | def pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens_q, seqlen_q_rounded, seqlen_k_rounded): |
| LOW | tests/test_flash_attn_ck.py | 124 | def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): |
| LOW | tests/test_flash_attn_ck.py | 222 | def test_flash_attn_varlen_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): |
| LOW | tests/test_flash_attn_ck.py | 573 | def test_flash_attn_varlen_output( |
| LOW | tests/test_flash_attn_ck.py | 931 | def test_flash_attn_varlen_causal( |
| LOW | tests/test_flash_attn_ck.py | 1363 | def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, dropout_p, causal, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1410 | def test_flash_attn_bwd_overflow(seqlen, d, causal, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1469 | def test_flash_attn_bwd_transpose(seqlen, d, causal, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1522 | def test_flash_attn_bwd_varlen_overflow(d, causal, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1558 | def test_flash_attn_bwd_varlen_seqq_zero(d, causal, nheads_kv, deterministic, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1619 | def test_flash_attn_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): |
| LOW | tests/test_flash_attn_ck.py | 1670 | def test_flash_attn_varlen_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): |
| LOW | tests/test_util.py | 8 | def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random", zero_lengths=False): |
| LOW | tests/test_flash_attn.py | 29 | def attn_bias_from_alibi_slopes( |
| LOW | tests/test_flash_attn.py | 58 | def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): |
| LOW | tests/test_flash_attn.py | 382 | def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): |
| LOW | tests/test_flash_attn.py | 586 | def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): |
| LOW | tests/test_flash_attn.py | 733 | def test_flash_attn_varlen_qkvpacked( |
| LOW | tests/test_flash_attn.py | 1172 | def test_flash_attn_varlen_output( |
| 271 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | hopper/test_attn_kvcache.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, head_dim) k: (batch_size, seqlen_k, nheads_k, head_dim) v: (batch_size, seq |
| HIGH | tests/test_flash_attn_triton_amd.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, head_dim) k: (batch_size, seqlen_k, nheads_k, head_dim) v: (batch_size, seq |
| HIGH | tests/test_util.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, head_dim) k: (batch_size, seqlen_k, nheads_k, head_dim) v: (batch_size, seq |
| HIGH | tests/test_flash_attn.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, head_dim) k: (batch_size, seqlen_k, nheads_k, head_dim) v: (batch_size, seq |
| HIGH | hopper/test_kvcache.py | 0 | use pytorch benchmark on the forward pass of an arbitrary function. |
| HIGH | benchmarks/benchmark_gemm.py | 0 | use pytorch benchmark on the forward pass of an arbitrary function. |
| HIGH | flash_attn/cute/benchmark.py | 0 | use pytorch benchmark on the forward pass of an arbitrary function. |
| HIGH | flash_attn/utils/benchmark.py | 0 | use pytorch benchmark on the forward pass of an arbitrary function. |
| HIGH | hopper/benchmark_flash_attention_fp8.py | 0 | arguments: qkv: (batch_size, seqlen, 3, nheads, head_dim) dropout_p: float output: output: (batch_size, seqlen, nheads, |
| HIGH | benchmarks/benchmark_causal.py | 0 | arguments: qkv: (batch_size, seqlen, 3, nheads, head_dim) dropout_p: float output: output: (batch_size, seqlen, nheads, |
| HIGH | benchmarks/benchmark_flash_attention.py | 0 | arguments: qkv: (batch_size, seqlen, 3, nheads, head_dim) dropout_p: float output: output: (batch_size, seqlen, nheads, |
| HIGH | training/src/callbacks/wandb_callbacks.py | 0 | start executing this callback only after all validation sanity checks end. |
| HIGH | training/src/callbacks/wandb_callbacks.py | 0 | start executing this callback only after all validation sanity checks end. |
| HIGH | training/src/callbacks/wandb_callbacks.py | 0 | start executing this callback only after all validation sanity checks end. |
| HIGH | training/src/models/modules/seq_common.py | 0 | hidden_states: (b, s, d) if batch_first else (s, b, d) |
| HIGH | training/src/models/modules/seq_common.py | 0 | hidden_states: (b, s, d) if batch_first else (s, b, d) |
| HIGH | training/src/models/modules/seq_common.py | 0 | hidden_states: (b, s, d) if batch_first else (s, b, d) |
| HIGH | training/src/models/modules/seq_common.py | 0 | hidden_states: (b, s, d) if batch_first else (s, b, d) |
| HIGH | tests/test_flash_attn_triton_amd.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, d) k: (batch_size, seqlen_k, nheads_k, d) v: (batch_size, seqlen_k, nheads_ |
| HIGH | tests/test_util.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, d) k: (batch_size, seqlen_k, nheads_k, d) v: (batch_size, seqlen_k, nheads_ |
| HIGH | tests/test_flash_attn.py | 0 | arguments: q: (batch_size, seqlen_q, nheads, d) k: (batch_size, seqlen_k, nheads_k, d) v: (batch_size, seqlen_k, nheads_ |
| HIGH | tests/test_flash_attn_triton_amd.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/test_flash_attn_ck.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/test_flash_attn.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/test_flash_attn_triton_amd.py | 0 | we previously had a bug where we were using the wrong strides of dout, which shows up when dout is not contiguous. |
| HIGH | tests/test_flash_attn_ck.py | 0 | we previously had a bug where we were using the wrong strides of dout, which shows up when dout is not contiguous. |
| HIGH | tests/test_flash_attn.py | 0 | we previously had a bug where we were using the wrong strides of dout, which shows up when dout is not contiguous. |
| HIGH | tests/test_flash_attn_triton_amd.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/test_flash_attn_ck.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/test_flash_attn.py | 0 | we previously had a bug where not masking elements beyond seqlen_k caused nan in dq, in the case where seqlen % 128 != 0 |
| HIGH | tests/cute/test_score_mod_varlen.py | 0 | tests equality between original and vectorized versions of score mods |
| HIGH | tests/cute/test_score_mod.py | 0 | tests equality between original and vectorized versions of score mods |
| HIGH | tests/cute/test_score_mod.py | 0 | tests equality between original and vectorized versions of score mods |
| HIGH | tests/models/test_llama.py | 0 | check that our implementation matches the hf implementation: the scores in fp16 should be around the same as the hf scor |
| HIGH | tests/models/test_falcon.py | 0 | check that our implementation matches the hf implementation: the scores in fp16 should be around the same as the hf scor |
| HIGH | tests/models/test_baichuan.py | 0 | check that our implementation matches the hf implementation: the scores in fp16 should be around the same as the hf scor |
| HIGH | flash_attn/cute/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/cute/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/cute/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/utils/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/utils/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/utils/benchmark.py | 0 | use pytorch benchmark on the forward+backward pass of an arbitrary function. |
| HIGH | flash_attn/cute/flash_fwd.py | 0 | configures and launches the flash attention kernel. mq/mk/mv/mo has same data types(supports fp16 and bf16) and same lay |
| HIGH | flash_attn/cute/flash_fwd.py | 0 | configures and launches the flash attention kernel. mq/mk/mv/mo has same data types(supports fp16 and bf16) and same lay |
| HIGH | flash_attn/cute/flash_fwd_sm90.py | 0 | configures and launches the flash attention kernel. mq/mk/mv/mo has same data types(supports fp16 and bf16) and same lay |
| HIGH | flash_attn/ops/rms_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/rms_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/rms_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/layer_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/layer_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/layer_norm.py | 0 | residual_in_fp32 only has an effect if residual is none. otherwise residual dtype is residual.dtype. |
| HIGH | flash_attn/ops/layer_norm.py | 0 | assume that arguments are contiguous and aligned to 16 bytes |
| HIGH | flash_attn/ops/layer_norm.py | 0 | assume that arguments are contiguous and aligned to 16 bytes |
| HIGH | flash_attn/ops/layer_norm.py | 0 | assume that arguments are contiguous and aligned to 16 bytes |
| HIGH | flash_attn/modules/mha.py | 0 | implement the scaled dot product attention with softmax. arguments --------- softmax_scale: the temperature to use for t |
| HIGH | flash_attn/modules/mha.py | 0 | implement the scaled dot product attention with softmax. arguments --------- softmax_scale: the temperature to use for t |
| HIGH | flash_attn/modules/mha.py | 0 | implement the scaled dot product attention with softmax. arguments --------- softmax_scale: the temperature to use for t |
| HIGH | flash_attn/modules/mha.py | 0 | implement the scaled dot product attention with softmax. arguments --------- softmax_scale: the temperature to use for t |
| HIGH | flash_attn/modules/mha.py | 0 | kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim) |
| HIGH | flash_attn/modules/mha.py | 0 | kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim) |
| 1 more match not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | setup.py | 25 | |
| LOW | csrc/layer_norm/setup.py | 2 | |
| LOW | csrc/layer_norm/setup.py | 3 | |
| LOW | csrc/layer_norm/setup.py | 8 | |
| LOW | csrc/layer_norm/setup.py | 9 | |
| LOW | csrc/fused_dense_lib/setup.py | 5 | |
| LOW | tools/ci/run_fa4_ci.py | 7 | |
| LOW | hopper/benchmark_mla_decode.py | 11 | |
| LOW | hopper/test_attn_kvcache.py | 4 | |
| LOW | hopper/test_attn_kvcache.py | 6 | |
| LOW | hopper/test_attn_kvcache.py | 8 | |
| LOW | hopper/test_kvcache.py | 9 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 3 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 7 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 10 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 12 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 12 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 13 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 13 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 15 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 16 | |
| LOW | hopper/test_flash_attn_triton_amd.py | 7 | |
| LOW | hopper/benchmark_attn.py | 1 | |
| LOW | hopper/benchmark_attn.py | 2 | |
| LOW | hopper/benchmark_attn.py | 7 | |
| LOW | hopper/benchmark_attn.py | 8 | |
| LOW | hopper/benchmark_attn.py | 21 | |
| LOW | hopper/benchmark_attn.py | 24 | |
| LOW | hopper/benchmark_attn.py | 24 | |
| LOW | hopper/benchmark_attn.py | 24 | |
| LOW | hopper/benchmark_attn.py | 24 | |
| LOW | hopper/benchmark_attn.py | 24 | |
| LOW | hopper/setup.py | 25 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 2 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 7 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 8 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 10 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 16 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 16 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 23 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 24 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 24 | |
| LOW | hopper/test_flash_attn.py | 7 | |
| LOW | hopper/flash_attn_interface.py | 3 | |
| LOW | hopper/flash_attn_interface.py | 7 | |
| LOW | hopper/benchmark_split_kv.py | 5 | |
| LOW | training/tests/datamodules/test_language_modeling_hf.py | 6 | |
| LOW | training/src/eval.py | 1 | |
| LOW | training/src/eval.py | 8 | |
| LOW | training/src/metrics/perplexity.py | 9 | |
| LOW | training/src/metrics/accuracy.py | 1 | |
| LOW | training/src/metrics/accuracy.py | 4 | |
| LOW | training/src/tasks/seq.py | 1 | |
| LOW | training/src/tasks/seq.py | 4 | |
| LOW | training/src/distributed/ddp_comm_hooks.py | 3 | |
| LOW | training/src/distributed/ddp_comm_hooks.py | 3 | |
| LOW | training/src/callbacks/flop_count.py | 2 | |
| LOW | training/src/callbacks/flop_count.py | 8 | |
| LOW | training/src/callbacks/causality_monitor.py | 2 | |
| LOW | training/src/callbacks/ema.py | 8 | |
| 140 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| CRITICAL | hopper/test_flash_attn_triton_amd.py | 1140 | assert torch.ops.flash_attn_3.fwd.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn_triton_amd.py | 1153 | assert torch.ops.flash_attn_3.bwd.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn_triton_amd.py | 1161 | assert torch.ops.flash_attn_3.fwd_combine.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn_triton_amd.py | 1165 | assert torch.ops.flash_attn_3.get_scheduler_metadata.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn.py | 1230 | assert torch.ops.flash_attn_3.fwd.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn.py | 1243 | assert torch.ops.flash_attn_3.bwd.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn.py | 1251 | assert torch.ops.flash_attn_3.fwd_combine.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | hopper/test_flash_attn.py | 1255 | assert torch.ops.flash_attn_3.get_scheduler_metadata.default._schema.is_backward_compatible_with(parse_schema( |
| CRITICAL | tests/models/test_btlm.py | 214 | assert model.transformer.embeddings.word_embeddings.weight.mean().abs() < 1e-4 |
| CRITICAL | tests/models/test_btlm.py | 216 | model.transformer.embeddings.word_embeddings.weight.std() |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | setup.py | 101 | |
| LOW | tools/sass_diff.py | 128 | |
| LOW | hopper/benchmark_flash_attention_fp8.py | 34 | |
| LOW | hopper/test_flash_attn_triton_amd.py | 628 | |
| LOW | hopper/benchmark_attn.py | 76 | |
| LOW | hopper/setup.py | 125 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 110 | |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 391 | |
| LOW | hopper/test_flash_attn.py | 715 | |
| LOW | hopper/benchmark_split_kv.py | 35 | |
| LOW | training/src/train.py | 32 | |
| LOW | training/src/callbacks/norm_monitor.py | 33 | |
| LOW | training/src/optim/param_grouping.py | 15 | |
| LOW | training/src/utils/gpu_affinity.py | 80 | |
| LOW | training/src/utils/gpu_affinity.py | 127 | |
| LOW | training/src/utils/ema.py | 228 | |
| LOW | training/src/utils/distributed.py | 70 | |
| LOW | training/src/models/modules/seq_common.py | 15 | |
| LOW | tests/cute/benchmark_mask_mod.py | 154 | |
| LOW | tests/cute/benchmark_mask_mod.py | 448 | |
| LOW | tests/cute/test_mask_mod_varlen.py | 249 | |
| LOW | tests/cute/test_mask_mod_varlen.py | 903 | |
| LOW | tests/cute/test_flash_attn_race_condition.py | 391 | |
| LOW | tests/cute/test_score_mod_varlen.py | 602 | |
| LOW | tests/cute/test_score_mod_varlen.py | 950 | |
| LOW | tests/cute/test_mask_mod.py | 1905 | |
| LOW | tests/cute/test_mask_mod.py | 1951 | |
| LOW | tests/cute/test_flash_attn.py | 125 | |
| LOW | tests/cute/test_flash_attn.py | 563 | |
| LOW | tests/cute/test_flash_attn.py | 1070 | |
| LOW | tests/cute/test_flash_attn.py | 2375 | |
| LOW | tests/cute/test_block_sparsity.py | 43 | |
| LOW | tests/cute/test_block_sparsity.py | 484 | |
| LOW | benchmarks/tune_ex2_emu.py | 33 | |
| LOW | benchmarks/tune_ex2_emu.py | 225 | |
| LOW | benchmarks/benchmark_attn.py | 361 | |
| LOW | benchmarks/bench_sm90.py | 334 | |
| LOW | benchmarks/bench_sm90.py | 367 | |
| LOW | benchmarks/bench_sm90.py | 397 | |
| LOW | benchmarks/bench_sm90.py | 452 | |
| LOW | benchmarks/bench_sm90.py | 489 | |
| LOW | flash_attn/flash_attn_triton.py | 66 | |
| LOW | flash_attn/flash_attn_triton.py | 365 | |
| LOW | flash_attn/flash_attn_interface.py | 31 | |
| LOW | flash_attn/cute/sm90_config_search.py | 174 | |
| LOW | flash_attn/cute/sm90_config_search.py | 315 | |
| LOW | flash_attn/cute/mask.py | 44 | |
| LOW | flash_attn/cute/mask.py | 145 | |
| LOW | flash_attn/cute/mask.py | 463 | |
| LOW | flash_attn/cute/mask.py | 580 | |
| LOW | flash_attn/cute/mask.py | 742 | |
| LOW | flash_attn/cute/mask.py | 1406 | |
| LOW | flash_attn/cute/mask.py | 1548 | |
| LOW | flash_attn/cute/mask.py | 1623 | |
| LOW | flash_attn/cute/sm100_hd256_2cta_fmha_forward.py | 554 | |
| LOW | flash_attn/cute/sm100_hd256_2cta_fmha_forward.py | 1525 | |
| LOW | flash_attn/cute/ampere_helpers.py | 35 | |
| LOW | flash_attn/cute/flash_bwd.py | 481 | |
| LOW | flash_attn/cute/flash_bwd.py | 1165 | |
| LOW | flash_attn/cute/flash_bwd.py | 1194 | |
| 59 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | csrc/flash_attn_ck/flash_common.hpp | 1 | /****************************************************************************** |
| LOW | csrc/layer_norm/static_switch.h | 1 | // Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h |
| LOW | csrc/fused_dense_lib/fused_dense.cpp | 1 | // Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense.cpp |
| LOW | csrc/flash_attn/flash_api.cpp | 1 | /****************************************************************************** |
| LOW | csrc/flash_attn/src/flash_fwd_kernel.h | 1 | /****************************************************************************** |
| LOW | csrc/flash_attn/src/flash_fwd_kernel.h | 221 | // for (int i = 0; i < size(tScQ); ++i) { |
| LOW | csrc/flash_attn/src/utils.h | 1 | /****************************************************************************** |
| LOW | csrc/flash_attn/src/utils.h | 321 | cute::clear(D(_, m, _)); |
| LOW | csrc/flash_attn/src/utils.h | 341 | // if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) { |
| LOW | csrc/flash_attn/src/flash_fwd_launch_template.h | 1 | /****************************************************************************** |
| LOW | csrc/flash_attn/src/flash_bwd_launch_template.h | 1 | /****************************************************************************** |
| LOW | csrc/flash_attn/src/flash_bwd_launch_template.h | 21 | #define KERNEL_PARAM_MODIFIER __grid_constant__ |
| LOW | csrc/flash_attn/src/flash_bwd_launch_template.h | 181 | // run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, true, false, T>, Is_dropout>(params, |
| LOW | csrc/flash_attn/src/flash_bwd_launch_template.h | 241 | // printf("max_smem_per_block = %d\n", max_smem_per_block); |
| LOW | csrc/flash_attn/src/dropout.h | 41 | uint2 rowcol = make_uint2(block_row_start, block_col_start); |
| LOW | csrc/flash_attn/src/flash_bwd_kernel.h | 1 | /*************************************************************************************************** |
| LOW | csrc/flash_attn/src/flash_bwd_kernel.h | 321 | // If not local, we're guaranteed that m_block_min <= m_block: |
| LOW | csrc/flash_attn/src/static_switch.h | 1 | // Inspired by |
| LOW | tools/ci/build_sif.sh | 1 | #!/usr/bin/env bash |
| LOW | hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp | 1 | /****************************************************************************** |
| LOW | hopper/utils.h | 1 | /****************************************************************************** |
| LOW | hopper/mainloop_bwd_sm90_tma_gmma_ws.hpp | 1 | /****************************************************************************** |
| LOW | hopper/benchmark_mla_decode.py | 121 | print(f"Arithmetic intensity: {flops / mem_io:.1f}") |
| LOW | hopper/mainloop_bwd_sm80.hpp | 1 | /****************************************************************************** |
| LOW | hopper/mainloop_bwd_sm80.hpp | 561 | #pragma unroll |
| LOW | hopper/mainloop_bwd_sm80.hpp | 621 | // Instead of passing in tQcQ, we pass in t0QcQ and subtract the offset from the limit |
| LOW | hopper/flash_fwd_launch_template.h | 1 | /****************************************************************************** |
| LOW | hopper/benchmark_flash_attention_fp8.py | 321 | # )() |
| LOW | hopper/test_flash_attn_triton_amd.py | 61 | @pytest.mark.parametrize("deterministic", [False]) |
| LOW | hopper/test_flash_attn_triton_amd.py | 221 | and dtype != torch.float8_e4m3fn |
| LOW | hopper/test_flash_attn_triton_amd.py | 241 | # causal, |
| LOW | hopper/test_flash_attn_triton_amd.py | 501 | ): |
| LOW | hopper/test_flash_attn_triton_amd.py | 581 | # @pytest.mark.parametrize("mha_type", ["mha"]) |
| LOW | hopper/test_flash_attn_triton_amd.py | 901 | out = output_pad_fn(out) |
| LOW | hopper/tile_scheduler.hpp | 641 | // Total number of blocks for the next 31 batches |
| LOW | hopper/tile_scheduler.hpp | 741 | int split_idx = bidh - bidh_actual * num_splits; |
| LOW | hopper/benchmark_attn.py | 41 | def time_fwd(func, *args, repeats=30, verbose=True, desc="", **kwargs): |
| LOW | hopper/benchmark_attn.py | 241 | # bs_seqlen_vals = [(32, 512), (16, 1024)] |
| LOW | hopper/benchmark_attn.py | 401 | # print(time_f) |
| LOW | hopper/flash_bwd_launch_template.h | 1 | /****************************************************************************** |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 61 | # @pytest.mark.parametrize("mha_type", ["mqa"]) |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 341 | # @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn]) |
| LOW | hopper/test_flash_attn_bwd_determinism.py | 401 | # batch_size = 40 |
| LOW | hopper/static_switch.h | 1 | // Inspired by |
| LOW | hopper/mainloop_fwd_sm80.hpp | 1 | /****************************************************************************** |
| LOW | hopper/test_flash_attn.py | 121 | # @pytest.mark.parametrize("has_qv", [True]) |
| LOW | hopper/test_flash_attn.py | 301 | # k, |
| LOW | hopper/test_flash_attn.py | 361 | @pytest.mark.parametrize("softcap", [0.0] + ([15.0] if not DISABLE_SOFTCAP else [])) |
| LOW | hopper/test_flash_attn.py | 601 | # None, |
| LOW | hopper/test_flash_attn.py | 621 | dv.masked_fill_(k_zero_masking, 0.0) |
| LOW | hopper/flash_fwd_kernel_sm90.h | 1 | /****************************************************************************** |
| LOW | hopper/flash_api.cpp | 1221 | #ifndef FLASHATTENTION_DISABLE_HDIM256 |
| LOW | hopper/benchmark_split_kv.py | 121 | causal=causal, |
| LOW | hopper/flash_api_stable.cpp | 1 | /****************************************************************************** |
| LOW | hopper/flash_api_stable.cpp | 541 | #endif |
| LOW | hopper/flash_api_stable.cpp | 1241 | if (out_type == torch::headeronly::ScalarType::BFloat16) { |
| LOW | hopper/flash_api_stable.cpp | 1281 | if (params.d_rounded == 64) { return run_mha_bwd_<Arch, cutlass::half_t, 64, Has_softcap>(params, stream); } |
| LOW | hopper/flash_api_stable.cpp | 1301 | #endif |
| LOW | training/configs/experiment/owt/gpt2xl-flash.yaml | 1 | # @package _global_ |
| LOW | training/configs/experiment/owt/gpt2l-flash.yaml | 1 | # @package _global_ |
| 35 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | hopper/flash_fwd_combine_kernel.h | 229 | // Step 1: load LSE_partial from gmem -> smem |
| LOW | hopper/flash_fwd_combine_kernel.h | 274 | // Step 2: Load O_partial from gmem -> smem for split = 0, 1, ..., kStages - 2. |
| LOW | hopper/flash_fwd_combine_kernel.h | 335 | // Step 3: load and transpose LSE_partial from smem -> rmem |
| LOW | hopper/flash_fwd_combine_kernel.h | 345 | // Step 4: compute the final LSE along the split dimension |
| LOW | hopper/flash_fwd_combine_kernel.h | 394 | // Step 5: store final LSE back to gmem |
| LOW | hopper/flash_fwd_combine_kernel.h | 417 | // Step 6: read O_partial from gmem -> smem -> rmem and accumulate the final O |
| LOW | hopper/flash_fwd_combine_kernel.h | 460 | // Step 7: Write the final O to gmem |
| LOW | hopper/flash_bwd_postprocess_kernel.h | 174 | // Step 1: load dQaccum from gmem to smem |
| LOW | hopper/flash_bwd_postprocess_kernel.h | 200 | // Step 2: Load dQaccum from smem to register, then convert fp32 -> fp16/bf16 |
| LOW | hopper/flash_bwd_postprocess_kernel.h | 218 | // Step 3: Copy dQ from register to smem |
| LOW | hopper/flash_bwd_postprocess_kernel.h | 229 | // Step 4: Copy dQ from smem to register to prepare for coalesced write to gmem |
| LOW | hopper/flash_bwd_postprocess_kernel.h | 247 | // Step 5: Copy dQ from register to gmem |
| LOW | hopper/epilogue_fwd.hpp | 251 | // Step 1: Write O from rmem -> smem |
| LOW | hopper/epilogue_fwd.hpp | 281 | // Step 2: Write LSE from rmem -> gmem |
| LOW | hopper/epilogue_fwd.hpp | 310 | // Step 3: Write O from smem -> gmem |
| LOW | flash_attn/cute/flash_bwd.py | 251 | # Do we need to check if we overshot kBlockM when we load Q? |
| LOW | flash_attn/cute/flash_bwd.py | 253 | # Do we need to check if we overshot kBlockN when we load K? |
| LOW | flash_attn/cute/flash_bwd.py | 260 | # Do we need to check if we overshot kBlockN when we load V? |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 492 | # Step 1: load dQaccum from gmem to smem |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 501 | # Step 2: load dQ from smem to rmem |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 534 | # Step 3: Copy dQ from register to smem |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 568 | # Step 4: Copy dQ from smem to register to prepare for coalesced write to gmem |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 577 | # Step 5: Copy dQ from register to gmem |
| LOW | flash_attn/cute/flash_fwd.py | 495 | # Do we need to check if we overshoot kBlockN when we load K? |
| LOW | flash_attn/cute/flash_fwd.py | 541 | # Do we need to check if we overshoot kBlockN when we load V? |
| LOW | flash_attn/cute/flash_fwd_combine.py | 405 | # Step 1: Load LSE_partial from gmem to shared memory |
| LOW | flash_attn/cute/flash_fwd_combine.py | 442 | # Step 2: Load O_partial for pipeline stages |
| LOW | flash_attn/cute/flash_fwd_combine.py | 495 | # Step 3: Load and transpose LSE from smem to registers |
| LOW | flash_attn/cute/flash_fwd_combine.py | 513 | # Step 4: Compute final LSE along split dimension |
| LOW | flash_attn/cute/flash_fwd_combine.py | 573 | # Step 5: Store final LSE to gmem |
| LOW | flash_attn/cute/flash_fwd_combine.py | 595 | # Step 6: Read O_partial and accumulate final O |
| LOW | flash_attn/cute/flash_fwd_combine.py | 642 | # Step 7: Write final O to gmem |
| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | hopper/setup.py | 295 | blocks.append(cuda_compile_rule) # type: ignore[possibly-undefined] |
| HIGH | hopper/setup.py | 296 | blocks.append(cuda_compile_rule_sm80) # type: ignore[possibly-undefined] |
| HIGH | hopper/setup.py | 297 | blocks.append(cuda_compile_rule_sm80_sm90) # type: ignore[possibly-undefined] |
| HIGH | hopper/setup.py | 298 | blocks.append(cuda_compile_rule_sm100) # type: ignore[possibly-undefined] |
| HIGH | tests/cute/test_flash_attn_combine.py | 183 | # Only compare valid positions (beyond seqused, output is undefined) |
| HIGH | AI/parse_clc_log.py | 246 | let selectedSm = null; |
| HIGH | AI/parse_clc_log.py | 309 | if (id === query || id.includes(query)) {{ |
| HIGH | AI/parse_clc_log.py | 326 | selectedSm = null; |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | hopper/generate_kernels.py | 134 | # so we should just pass in packgqa=False to avoid the `_packgqa` in the filename. |
| LOW | tests/cute/score_mod_definitions.py | 478 | # Don't read from aux_tensors at all - just add the global index as bias |
| MEDIUM | tests/cute/test_mask_mod.py | 6 | # (identity, document, block_diagonal, etc.) with comprehensive seqlen coverage |
| LOW | flash_attn/flash_attn_triton.py | 145 | # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call |
| LOW | flash_attn/flash_attn_triton.py | 347 | # if we just call tl.store(dv_ptrs), there's a race condition |
| LOW | flash_attn/flash_attn_triton.py | 442 | # if we just call tl.load(k_ptrs), we get the wrong output! |
| LOW | flash_attn/flash_attn_triton.py | 521 | # [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask. |
| LOW | flash_attn/cute/flash_bwd_postprocess.py | 187 | # We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split across 2 WGs, |
| MEDIUM | flash_attn/cute/flash_fwd_sm100.py | 701 | # CLC buffers placed here to utilize padding before sO's 1024-byte alignment. |
| LOW | flash_attn/cute/flash_fwd_sm90.py | 1390 | # 2 elements. So we just call ptx directly. |
| LOW | flash_attn/cute/flash_fwd_sm90.py | 1462 | # 2 elements. So we just call ptx directly. |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | setup.py | 186 | except Exception as e: |
| MEDIUM | setup.py | 180 | def detect_hipify_v2(): |
| LOW | tests/cute/benchmark_block_sparsity.py | 83 | except Exception as e: |
| LOW | tests/cute/benchmark_block_sparsity.py | 190 | except Exception as e: |
| LOW | tests/cute/benchmark_block_sparsity.py | 375 | except Exception as e: |
| LOW | benchmarks/tune_ex2_emu.py | 307 | except Exception as e: |
| LOW | benchmarks/tune_ex2_emu.py | 370 | except Exception as e: |
| LOW | benchmarks/bench_sm90.py | 126 | except Exception as e: |
| LOW | benchmarks/bench_sm90.py | 165 | except Exception as e: |
| LOW | benchmarks/bench_sm90.py | 175 | except Exception as e: |
| LOW | benchmarks/benchmark_flash_attention.py | 119 | except Exception: |
| LOW | benchmarks/benchmark_flash_attention.py | 134 | except Exception: |
| LOW | benchmarks/benchmark_flash_attention.py | 141 | except Exception: |
| LOW | flash_attn/cute/cute_dsl_ptxas.py | 93 | except Exception as e: |
| LOW | flash_attn/cute/benchmark_flash_attention_fp8.py | 330 | except Exception as e: |
| LOW | flash_attn/cute/benchmark_flash_attention_fp8.py | 402 | except Exception as e: |
| LOW | flash_attn/cute/utils.py | 77 | except Exception: |
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | hopper/generate_kernels.py | 3 | # This file is run to generate the kernel instantiations for the flash_attn kernels |
| MEDIUM | tests/cute/test_flash_attn_combine.py | 254 | # Create a permuted batch index mapping: virtual batch -> real batch |
| MEDIUM | tests/models/test_llama.py | 578 | # Create a shared test model. |
| MEDIUM | flash_attn/modules/embedding.py | 137 | # Create a mask of valid vocab ids (1 means it needs to be masked). |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | setup.py | 463 | # Check if torch is using hipify v2. Until CK is updated with HIPIFY_V2 macro, |
| LOW | hopper/setup.py | 384 | # Set timeout to 300 seconds to prevent the request from hanging forever. |
| LOW | tests/cute/test_utils.py | 204 | # Set __cute_hash__ to simulate Inductor-generated code |
| LOW | tests/cute/test_block_sparsity.py | 111 | # Check if ref skipped it entirely (all masked) |
| LOW | flash_attn/cute/flash_bwd.py | 133 | # Check if block size setting is out of shared memory capacity |
| LOW | flash_attn/cute/compute_block_sparsity.py | 374 | # Check if mask_mod is marked as suitable for 5-point sampling |
| LOW | flash_attn/cute/flash_fwd.py | 158 | # Check if block size setting is out of shared memory capacity |
| LOW | flash_attn/cute/flash_fwd.py | 171 | # Check if twice the block size is divisible by the number of threads |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | tools/ci/build_sif.sh | 4 | # Usage: |
| LOW | tests/cute/test_mask_mod_varlen.py | 8 | # Usage: |
| LOW | tests/cute/test_mask_mod.py | 10 | # Usage: |