SGLang is a high-performance serving framework for large language models and multimodal models.
17209 matches across 20 categories. Click a row to expand file-level details.
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | benchmark/dspy/bench_dspy_intro.py | 136 | def validate_context_and_answer(example, pred, trace=None): |
| LOW | …ing_window_attention_triton/bench_triton_swa_kernel.py | 11 | def extend_attention_fwd_torch( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 122 | def cleanup_flashinfer_workspace(ipc_handles): |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 85 | def setup_flashinfer_workspace( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 152 | def get_trtllm_fused_allreduce_kwargs(self): |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 162 | def flashinfer_fused_allreduce_rmsnorm( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 287 | def standard_allreduce_rmsnorm( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 397 | def standard_allreduce_rmsnorm_native( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 478 | def standard_allreduce_rmsnorm_native_compiled( |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 894 | def prepare_results_with_speedups(results_dict): |
| LOW | …uler_batch/benchmark_write_req_to_token_pool_triton.py | 12 | def write_req_to_token_pool_triton( |
| LOW | …uler_batch/benchmark_write_req_to_token_pool_triton.py | 49 | def write_req_to_token_pool_triton_optimize( |
| LOW | …uler_batch/benchmark_write_req_to_token_pool_triton.py | 91 | def write_req_to_token_pool_reference( |
| LOW | …uler_batch/benchmark_write_req_to_token_pool_triton.py | 114 | def test_write_req_to_token_pool(): |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 23 | def calculate_shard_intermediate_size( |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 172 | def get_rocm_configs_compute_bound() -> List[Dict[str, int]]: |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 195 | def get_configs_compute_bound() -> List[Dict[str, int]]: |
| LOW | …pseek/benchmark_deepgemm_dsv3_router_gemm_blackwell.py | 137 | def get_benchmark_plot_friendly(tp_sizes): |
| LOW | …nels/deepseek/benchmark_deepgemm_fp8_gemm_blackwell.py | 231 | def get_benchmark_plot_friendly(tp_size): |
| LOW | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 148 | def get_rocm_configs_compute_bound(): |
| LOW | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 171 | def get_configs_compute_bound(): |
| LOW | …s/decoding_attention_triton/triton_flashinfer_cudnn.py | 98 | def decode_attention_flashinfer(dtype, head_num_q, head_num_kv): |
| LOW | benchmark/mmmu/bench_sglang.py | 127 | async def process_sample_with_semaphore( |
| LOW | benchmark/mmmu/eval_utils.py | 292 | def _parse_explicit_multi_choice_answer(response, all_choices): |
| LOW | benchmark/mmmu/eval_utils.py | 303 | def parse_multi_choice_response(response, all_choices, index2ans): |
| LOW | …hmark/bench_pynccl_allocator/bench_segment_tracking.py | 56 | def bench_register_segments_with_comm( |
| LOW | …hmark/bench_pynccl_allocator/bench_segment_tracking.py | 114 | def bench_with_various_segment_counts( |
| LOW | benchmark/scheduler/bench_token_storage.py | 84 | def _batch_tensor_from_pyarrays(parts: list[array]) -> torch.Tensor: |
| LOW | benchmark/scheduler/bench_token_storage.py | 256 | def microbench_torch_tensor_paths( |
| LOW | benchmark/hicache/bench_mix.py | 299 | async def async_request_sglang_generate( |
| LOW | benchmark/hicache/bench_warm_cache.py | 90 | def _create_bench_client_session() -> aiohttp.ClientSession: |
| LOW | benchmark/hicache/bench_warm_cache.py | 97 | async def async_request_sglang_generate( |
| LOW | benchmark/hicache/bench_warm_cache.py | 413 | def maybe_write_summary_jsonl( |
| LOW | benchmark/hicache/bench_warm_cache.py | 472 | async def benchmark_shared_prefix_pct( |
| LOW | benchmark/hicache/bench_serving.py | 71 | async def async_request_openai_completions( |
| LOW | benchmark/hicache/data_processing.py | 150 | def sample_ultrachat_requests( |
| LOW | benchmark/hicache/data_processing.py | 434 | def sample_generated_shared_prefix_requests( |
| LOW | benchmark/prefill_only/bench_score.py | 65 | def create_score_request_builder(): |
| LOW | benchmark/prefill_only/bench_score.py | 80 | def generate_text_with_token_count_local(num_toks): |
| LOW | benchmark/prefill_only/bench_score.py | 120 | def build_warmup_score_request() -> dict: |
| LOW | benchmark/prefill_only/util.py | 46 | def generate_text_with_token_count( |
| LOW | benchmark/prefill_only/util.py | 106 | def prepare_all_requests_parallel( |
| LOW | benchmark/prefill_only/util.py | 398 | async def perform_global_warmup_and_freeze( |
| LOW | benchmark/prefill_only/bench_embeddings.py | 100 | def validate_embeddings_response(response_data: dict) -> bool: |
| LOW | benchmark/prefill_only/bench_embeddings.py | 110 | def build_warmup_embeddings_request() -> dict: |
| LOW | …nchmark/bench_in_batch_prefix/bench_in_batch_prefix.py | 76 | def test_batch_by_batch_with_hint(all_prompts, gen_len): |
| LOW | benchmark/asr/bench_sglang.py | 55 | def run_asr_transcription_sync(client, model_name, y, sr, language=None): |
| LOW | benchmark/asr/bench_sglang.py | 75 | def run_asr_transcription_stream_sync( |
| LOW | benchmark/benchmark_batch/benchmark_tokenizer.py | 179 | def generate_random_token_ids(*, num_prompts, num_tokens, tokenizer): |
| LOW | benchmark/lora/lora_bench.py | 48 | async def async_request_openai_completions( |
| LOW | …ark/bench_linear_attention/bench_cutedsl_kda_decode.py | 155 | def run_prefill_then_decode_baseline(inp): |
| LOW | …ark/bench_linear_attention/bench_cutedsl_kda_decode.py | 191 | def run_prefill_then_decode_cutedsl(inp): |
| LOW | benchmark/generative_agents/agent_functions.py | 56 | def generate_event_triple_prompt(persona_name, action): |
| LOW | benchmark/generative_agents/agent_functions.py | 89 | def generate_pronunciatio_prompt(action): |
| LOW | benchmark/generative_agents/agent_functions.py | 158 | def action_location_sector_prompt( |
| LOW | benchmark/generative_agents/agent_functions.py | 260 | def action_location_object_prompt( |
| LOW | benchmark/llm_judge/bench_other.py | 52 | async def multi_dimension_judge_async(article, generate): |
| LOW | …l/sgl-router/tests/scripts/generate_parity_fixtures.py | 63 | def load_tokenizer_with_fallback(primary, fallback, slug): |
| LOW | experimental/sgl-router/tests/e2e/conftest.py | 169 | def build_smoke_router_config( |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_lifecycle.py | 54 | def test_router_routes_after_scale_up(self, router_url): |
| 8944 more matches not shown… | | | |

| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 13 | # ============================================================================== |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 128 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 130 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 181 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 183 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 333 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 335 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 390 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 392 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/lora/lora_bench.py | 13 | # ============================================================================== |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 39 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 41 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 89 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 91 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 155 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 157 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 206 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 208 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 299 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_decode.py | 301 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 38 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 40 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 68 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 70 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 117 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 119 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 173 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 175 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 232 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 234 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 323 | # --------------------------------------------------------------------------- |
| MEDIUM | …rk/bench_linear_attention/bench_gdn_prefill_cutedsl.py | 325 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 37 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 39 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 96 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 98 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 161 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 163 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 247 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 249 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 350 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 352 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 448 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_linear_attention/bench_gdn_prefill.py | 450 | # --------------------------------------------------------------------------- |
| MEDIUM | benchmark/bench_rope/benchmark_rope_index.py | 21 | # ----------------------------- |
| MEDIUM | benchmark/bench_rope/benchmark_rope_index.py | 23 | # ----------------------------- |
| MEDIUM | benchmark/bench_rope/benchmark_rope_index.py | 39 | # ----------------------------- |
| MEDIUM | benchmark/bench_rope/benchmark_rope_index.py | 41 | # ----------------------------- |
| MEDIUM | experimental/sgl-router/tests/e2e/conftest.py | 246 | # --------------------------------------------------------------------------- |
| MEDIUM | experimental/sgl-router/tests/e2e/conftest.py | 248 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 37 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 39 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 49 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 51 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 74 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 76 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 81 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 83 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 88 | # --------------------------------------------------------------------------- |
| MEDIUM | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 92 | # --------------------------------------------------------------------------- |
| 1429 more matches not shown… | | | |

| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 0 | this function performs matrix multiplication with block-wise quantization. it takes two input tensors `a` and `b` with s |
| HIGH | python/sglang/srt/layers/quantization/int8_kernel.py | 0 | this function performs matrix multiplication with block-wise quantization. it takes two input tensors `a` and `b` with s |
| HIGH | python/sglang/srt/layers/quantization/fp8_kernel.py | 0 | this function performs matrix multiplication with block-wise quantization. it takes two input tensors `a` and `b` with s |
| HIGH | benchmark/tree_of_thought_v0/bench_sglang.py | 0 | please generate a high-level plan for solving the following question. as the first step, just say what method and idea y |
| HIGH | benchmark/tree_of_thought_v0/bench_other.py | 0 | please generate a high-level plan for solving the following question. as the first step, just say what method and idea y |
| HIGH | benchmark/tree_of_thought_deep/bench_sglang.py | 0 | please generate a high-level plan for solving the following question. as the first step, just say what method and idea y |
| HIGH | benchmark/tree_of_thought_deep/lmql_funcs.py | 0 | please generate a high-level plan for solving the following question. as the first step, just say what method and idea y |
| HIGH | benchmark/tree_of_thought_deep/bench_other.py | 0 | please generate a high-level plan for solving the following question. as the first step, just say what method and idea y |
| HIGH | benchmark/tree_of_thought_v0/bench_sglang.py | 0 | the plan looks good! now, use real numbers and do the calculation. please solve the question step-by-step according to t |
| HIGH | benchmark/tree_of_thought_v0/bench_other.py | 0 | the plan looks good! now, use real numbers and do the calculation. please solve the question step-by-step according to t |
| HIGH | benchmark/tree_of_thought_deep/bench_sglang.py | 0 | the plan looks good! now, use real numbers and do the calculation. please solve the question step-by-step according to t |
| HIGH | benchmark/tree_of_thought_deep/lmql_funcs.py | 0 | the plan looks good! now, use real numbers and do the calculation. please solve the question step-by-step according to t |
| HIGH | benchmark/tree_of_thought_deep/bench_other.py | 0 | the plan looks good! now, use real numbers and do the calculation. please solve the question step-by-step according to t |
| HIGH | benchmark/tip_suggestion/bench_sglang.py | 0 | please expand a tip for a topic into a detailed paragraph. topic: staying healthy tip: regular exercise paragraph: incor |
| HIGH | benchmark/tip_suggestion/lmql_funcs.py | 0 | please expand a tip for a topic into a detailed paragraph. topic: staying healthy tip: regular exercise paragraph: incor |
| HIGH | benchmark/tip_suggestion/bench_other.py | 0 | please expand a tip for a topic into a detailed paragraph. topic: staying healthy tip: regular exercise paragraph: incor |
| HIGH | benchmark/json_jump_forward/bench_sglang.py | 0 | "house": "(gryffindor\|slytherin\|ravenclaw\|hufflepuff)",\n |
| HIGH | benchmark/json_jump_forward/bench_other.py | 0 | "house": "(gryffindor\|slytherin\|ravenclaw\|hufflepuff)",\n |
| HIGH | examples/frontend_language/usage/json_decode.py | 0 | "house": "(gryffindor\|slytherin\|ravenclaw\|hufflepuff)",\n |
| HIGH | benchmark/json_jump_forward/bench_sglang.py | 0 | "blood status": "(pure-blood\|half-blood\|muggle-born)",\n |
| HIGH | benchmark/json_jump_forward/bench_other.py | 0 | "blood status": "(pure-blood\|half-blood\|muggle-born)",\n |
| HIGH | examples/frontend_language/usage/json_decode.py | 0 | "blood status": "(pure-blood\|half-blood\|muggle-born)",\n |
| HIGH | benchmark/json_jump_forward/bench_sglang.py | 0 | "occupation": "(student\|teacher\|auror\|ministry of magic\|death eater\|order of the phoenix)",\n |
| HIGH | benchmark/json_jump_forward/bench_other.py | 0 | "occupation": "(student\|teacher\|auror\|ministry of magic\|death eater\|order of the phoenix)",\n |
| HIGH | examples/frontend_language/usage/json_decode.py | 0 | "occupation": "(student\|teacher\|auror\|ministry of magic\|death eater\|order of the phoenix)",\n |
| HIGH | benchmark/tree_of_thought_deep/bench_sglang.py | 0 | okay. now, evaluate your own solution and give it a score on a scale of 1 to 5. please do rigorous check of the correctn |
| HIGH | benchmark/tree_of_thought_deep/lmql_funcs.py | 0 | okay. now, evaluate your own solution and give it a score on a scale of 1 to 5. please do rigorous check of the correctn |
| HIGH | benchmark/tree_of_thought_deep/bench_other.py | 0 | okay. now, evaluate your own solution and give it a score on a scale of 1 to 5. please do rigorous check of the correctn |
| HIGH | benchmark/tree_of_thought_deep/bench_sglang.py | 0 | based on your reflection, do you change your mind? now, give me the final answer after careful consideration. |
| HIGH | benchmark/tree_of_thought_deep/lmql_funcs.py | 0 | based on your reflection, do you change your mind? now, give me the final answer after careful consideration. |
| HIGH | benchmark/tree_of_thought_deep/bench_other.py | 0 | based on your reflection, do you change your mind? now, give me the final answer after careful consideration. |
| HIGH | test/srt/cpu/test_qwen3.py | 0 | derives `query`, `key` and `value` tensors from `mixed_qkvzba`. |
| HIGH | test/registered/cpu/test_qwen3.py | 0 | derives `query`, `key` and `value` tensors from `mixed_qkvzba`. |
| HIGH | python/sglang/srt/models/qwen3_5.py | 0 | derives `query`, `key` and `value` tensors from `mixed_qkvzba`. |
| HIGH | python/sglang/srt/models/qwen3_next.py | 0 | derives `query`, `key` and `value` tensors from `mixed_qkvzba`. |
| HIGH | test/srt/cpu/utils.py | 0 | matrix multiplication function that supports per-token input quantization and per-column weight quantization |
| HIGH | test/registered/quant/test_int8_kernel.py | 0 | matrix multiplication function that supports per-token input quantization and per-column weight quantization |
| HIGH | test/registered/cpu/utils.py | 0 | matrix multiplication function that supports per-token input quantization and per-column weight quantization |
| HIGH | …t/registered/moe/test_triton_moe_channel_fp8_kernel.py | 0 | matrix multiplication function that supports per-token input quantization and per-column weight quantization |
| HIGH | test/srt/cpu/utils.py | 0 | this function performs fused moe with per-column int8 quantization using native torch. |
| HIGH | test/registered/quant/test_int8_kernel.py | 0 | this function performs fused moe with per-column int8 quantization using native torch. |
| HIGH | test/registered/cpu/utils.py | 0 | this function performs fused moe with per-column int8 quantization using native torch. |
| HIGH | …t/registered/moe/test_triton_moe_channel_fp8_kernel.py | 0 | this function performs fused moe with per-column int8 quantization using native torch. |
| HIGH | test/srt/cpu/test_causal_conv1d.py | 0 | x: (batch, dim, seqlen) weight: (dim, width) bias: (dim,) initial_states: (batch, dim, width - 1) final_states_out: (bat |
| HIGH | test/registered/layers/mamba/test_causal_conv1d.py | 0 | x: (batch, dim, seqlen) weight: (dim, width) bias: (dim,) initial_states: (batch, dim, width - 1) final_states_out: (bat |
| HIGH | test/registered/cpu/test_causal_conv1d.py | 0 | x: (batch, dim, seqlen) weight: (dim, width) bias: (dim,) initial_states: (batch, dim, width - 1) final_states_out: (bat |
| HIGH | sgl-kernel/tests/test_causal_conv1d.py | 0 | x: (batch, dim, seqlen) weight: (dim, width) bias: (dim,) initial_states: (batch, dim, width - 1) final_states_out: (bat |
| HIGH | test/srt/cpu/test_causal_conv1d.py | 0 | x: (batch, dim) or (batch, dim, seqlen) conv_state: (batch, dim, state_len), where state_len >= width - 1 weight: (dim, |
| HIGH | test/registered/layers/mamba/test_causal_conv1d.py | 0 | x: (batch, dim) or (batch, dim, seqlen) conv_state: (batch, dim, state_len), where state_len >= width - 1 weight: (dim, |
| HIGH | test/registered/cpu/test_causal_conv1d.py | 0 | x: (batch, dim) or (batch, dim, seqlen) conv_state: (batch, dim, state_len), where state_len >= width - 1 weight: (dim, |
| HIGH | sgl-kernel/tests/test_causal_conv1d.py | 0 | x: (batch, dim) or (batch, dim, seqlen) conv_state: (batch, dim, state_len), where state_len >= width - 1 weight: (dim, |
| HIGH | test/manual/quant/test_block_fp8.py | 0 | this function performs matrix multiplication with block-wise quantization using native torch. it takes two input tensors |
| HIGH | test/manual/quant/test_block_fp8_deep_gemm_blackwell.py | 0 | this function performs matrix multiplication with block-wise quantization using native torch. it takes two input tensors |
| HIGH | test/registered/quant/test_block_int8.py | 0 | this function performs matrix multiplication with block-wise quantization using native torch. it takes two input tensors |
| HIGH | test/manual/nightly/test_vlms_vit_cuda_graph.py | 0 | evaluate a vlm on the mmmu validation set with lmms‑eval. only `model_version` (checkpoint) and `chat_template` vary; we |
| HIGH | test/manual/nightly/test_vlms_vit_flashinfer_cudnn.py | 0 | evaluate a vlm on the mmmu validation set with lmms‑eval. only `model_version` (checkpoint) and `chat_template` vary; we |
| HIGH | test/manual/nightly/test_vlms_piecewise_cuda_graph.py | 0 | evaluate a vlm on the mmmu validation set with lmms‑eval. only `model_version` (checkpoint) and `chat_template` vary; we |
| HIGH | python/sglang/test/ascend/vlm_utils.py | 0 | evaluate a vlm on the mmmu validation set with lmms‑eval. only `model_version` (checkpoint) and `chat_template` vary; we |
| HIGH | test/manual/nightly/test_vlms_vit_cuda_graph.py | 0 | common method to run vlm mmmu benchmark test. args: model: model to test output_path: path for output logs test_name: op |
| HIGH | test/manual/nightly/test_vlms_vit_flashinfer_cudnn.py | 0 | common method to run vlm mmmu benchmark test. args: model: model to test output_path: path for output logs test_name: op |
| 364 more matches not shown… | | | |

| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | benchmark/scheduler/bench_token_storage.py | 25 | |
| LOW | benchmark/hicache/perf.py | 1 | |
| LOW | …router/tests/scripts/generate_kv_events_hash_parity.py | 48 | |
| LOW | …perimental/sgl-router/tests/e2e/test_tokenize_smoke.py | 5 | |
| LOW | experimental/sgl-router/tests/e2e/conftest.py | 21 | |
| LOW | experimental/sgl-router/tests/e2e/test_chat_smoke.py | 5 | |
| LOW | experimental/sgl-router/tests/e2e/test_chat_smoke.py | 8 | |
| LOW | experimental/sgl-router/tests/e2e/infra/gateway.py | 26 | |
| LOW | experimental/sgl-router/tests/e2e/infra/model_specs.py | 16 | |
| LOW | experimental/sgl-router/tests/e2e/infra/model_pool.py | 19 | |
| LOW | …al/sgl-router/tests/e2e/k8s_integration/fake_worker.py | 10 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_lifecycle.py | 14 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_lifecycle.py | 16 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_lifecycle.py | 19 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_lifecycle.py | 20 | |
| LOW | …ental/sgl-router/tests/e2e/k8s_integration/conftest.py | 12 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_discovery.py | 8 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_discovery.py | 11 | |
| LOW | …sgl-router/tests/e2e/k8s_integration/test_discovery.py | 12 | |
| LOW | …uter/tests/e2e/k8s_integration/test_cross_namespace.py | 15 | |
| LOW | …uter/tests/e2e/k8s_integration/test_cross_namespace.py | 18 | |
| LOW | …outer/tests/e2e/k8s_integration/test_reconciliation.py | 16 | |
| LOW | …outer/tests/e2e/k8s_integration/test_reconciliation.py | 18 | |
| LOW | …l-router/tests/e2e/chat_completions/test_validation.py | 14 | |
| LOW | …ts/e2e/chat_completions/test_two_router_convergence.py | 38 | |
| LOW | test/manual/test_ray_engine.py | 25 | |
| LOW | test/manual/quant/test_block_fp8.py | 679 | |
| LOW | test/manual/layers/moe/bench_mxfp4_sm90_kernels.py | 21 | |
| LOW | test/manual/hicache/test_pp_with_hicache.py | 79 | |
| LOW | …ion/unittests/dual_chunk/test_dual_chunk_flash_attn.py | 48 | |
| LOW | …ed/unit/layers/quantization/test_mxfp4_sm90_cutlass.py | 14 | |
| LOW | …ed/unit/layers/quantization/test_mxfp4_sm90_cutlass.py | 174 | |
| LOW | …d/unit/hardware_backend/mlx/test_attention_patching.py | 3 | |
| LOW | …istered/unit/hardware_backend/mlx/test_quantization.py | 13 | |
| LOW | test/registered/unit/distributed/test_parallel_state.py | 37 | |
| LOW | …egistered/unit/utils/test_hf_transformers_fastokens.py | 19 | |
| LOW | …t/registered/unit/model_loader/test_modelopt_export.py | 29 | |
| LOW | …saggregation/test_disaggregation_decode_radix_cache.py | 24 | |
| LOW | …saggregation/test_disaggregation_decode_radix_cache.py | 32 | |
| LOW | test/registered/layers/test_fla_layernorm_guard.py | 1 | |
| LOW | …gistered/models_e2e/test_deepseek_v4_flash_fp4_h200.py | 28 | |
| LOW | …st/registered/openai_server/basic/test_http2_server.py | 24 | |
| LOW | test/registered/prefill_only/test_serving_rerank.py | 134 | |
| LOW | test/registered/observability/test_tracing.py | 57 | |
| LOW | test/registered/vlm/test_vision_openai_server_a.py | 12 | |
| LOW | test/registered/amd/test_wan22_fp8_mla.py | 3 | |
| LOW | …t/registered/debug_utils/comparator/testing_helpers.py | 3 | |
| LOW | …gistered/debug_utils/comparator/test_meta_overrider.py | 3 | |
| LOW | test/registered/debug_utils/comparator/test_e2e_demo.py | 9 | |
| LOW | …tils/comparator/aligner/token_aligner/test_executor.py | 1 | |
| LOW | …ered/breakable_cuda_graph/test_breakable_cuda_graph.py | 37 | |
| LOW | …ered/breakable_cuda_graph/test_breakable_cuda_graph.py | 52 | |
| LOW | …ered/breakable_cuda_graph/test_breakable_cuda_graph.py | 198 | |
| LOW | …ered/breakable_cuda_graph/test_breakable_cuda_graph.py | 260 | |
| LOW | python/sglang/kernel_api_logging.py | 7 | |
| LOW | python/sglang/__init__.py | 35 | |
| LOW | python/sglang/__init__.py | 36 | |
| LOW | python/sglang/__init__.py | 36 | |
| LOW | python/sglang/__init__.py | 36 | |
| LOW | python/sglang/__init__.py | 36 | |
| 1758 more matches not shown… | | | |

| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | benchmark/bench_adaptive_speculative.py | 70 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 117 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 130 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 47 | except Exception: # pragma: no cover - fallback on non-supported platforms |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 650 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 664 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 683 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 700 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 720 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 737 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 762 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 787 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 811 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 828 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 854 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 882 | except Exception as e: |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 1083 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_aiter.py | 227 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_aiter.py | 237 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_aiter.py | 253 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_aiter.py | 265 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_aiter.py | 305 | except Exception: |
| LOW | benchmark/kernels/all_reduce/benchmark_all_reduce.py | 245 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_all_reduce.py | 255 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_all_reduce.py | 274 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_all_reduce.py | 284 | except Exception as e: |
| LOW | benchmark/kernels/all_reduce/benchmark_all_reduce.py | 324 | except Exception: |
| LOW | benchmark/kernels/quantization/bench_fp4_quant.py | 114 | except Exception as e: |
| LOW | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 228 | except Exception: |
| LOW | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 305 | except Exception: |
| LOW | benchmark/mmmu/bench_sglang.py | 64 | except Exception: |
| LOW | benchmark/mmmu/bench_hf.py | 35 | except Exception as first_exception: |
| LOW | benchmark/mmmu/bench_hf.py | 60 | except Exception as second_exception: |
| LOW | benchmark/mmmu/eval_utils.py | 195 | except Exception as exc: |
| MEDIUM | benchmark/mmmu/eval_utils.py | 514 | print(f"Error to extract answer from: {pred_i}") |
| LOW | benchmark/json_schema/bench_sglang.py | 104 | except Exception as e: |
| MEDIUM | …hmark/bench_pynccl_allocator/bench_segment_tracking.py | 195 | print("Error: CUDA is not available. This benchmark requires a GPU.") |
| LOW | benchmark/hicache/bench_mix.py | 362 | except Exception as e: |
| LOW | benchmark/hicache/bench_mix.py | 423 | except Exception as e: |
| MEDIUM | benchmark/hicache/bench_mix.py | 479 | print(f"Error processing response for client {user_data}: {e}") |
| MEDIUM | benchmark/hicache/bench_mix.py | 417 | def handle_request(self, user_data): |
| LOW | benchmark/hicache/bench_warm_cache.py | 169 | except Exception as exc: |
| LOW | benchmark/hicache/bench_serving.py | 194 | except Exception: |
| LOW | benchmark/hicache/bench_serving.py | 214 | except Exception: |
| LOW | benchmark/hicache/bench_serving.py | 273 | except Exception as e: |
| MEDIUM | benchmark/hicache/bench_serving.py | 632 | print(f"Error running benchmark for request rate: {request_rate}") |
| LOW | benchmark/hicache/bench_serving.py | 739 | except Exception as e: |
| MEDIUM | benchmark/hicache/bench_multiturn.py | 180 | print(f"Error writing to JSONL file: {e}") |
| LOW | benchmark/hicache/bench_multiturn.py | 395 | except Exception as e: |
| MEDIUM | benchmark/hicache/bench_multiturn.py | 543 | print(f"Error processing response for client {client_id}: {e}") |
| LOW | benchmark/hicache/bench_multiturn.py | 552 | except Exception as e: |
| LOW | benchmark/prefill_only/bench_score.py | 108 | except Exception as e: |
| MEDIUM | benchmark/prefill_only/bench_score.py | 109 | print(f"Error building request {index}: {e}") |
| LOW | benchmark/prefill_only/util.py | 131 | except Exception as e: |
| MEDIUM | benchmark/prefill_only/util.py | 132 | print(f"Error building request {index}: {e}") |
| LOW | benchmark/prefill_only/util.py | 160 | except Exception as e: |
| MEDIUM | benchmark/prefill_only/util.py | 161 | print(f"Error processing request result: {e}") |
| LOW | benchmark/prefill_only/util.py | 278 | except Exception as e: |
| LOW | benchmark/prefill_only/util.py | 327 | except Exception as e: |
| MEDIUM | benchmark/prefill_only/util.py | 328 | print(f"Error sending {profile_text} request: {e}") |
| 1356 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 605 | |
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 894 | |
| LOW | …hmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py | 367 | |
| LOW | …/fused_moe_triton/benchmark_torch_compile_fused_moe.py | 16 | |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 32 | |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 172 | |
| LOW | benchmark/kernels/fused_moe_triton/common_utils.py | 195 | |
| LOW | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 148 | |
| LOW | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 171 | |
| LOW | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 133 | |
| LOW | benchmark/kernels/lora_csgmv/tune_lora_csgmv.py | 151 | |
| LOW | benchmark/kernels/deepep/deepep_utils.py | 157 | |
| LOW | benchmark/kernels/deepep/tuning_deepep.py | 30 | |
| LOW | …s/decoding_attention_triton/triton_flashinfer_cudnn.py | 159 | |
| LOW | benchmark/mmmu/eval_utils.py | 154 | |
| LOW | benchmark/mmmu/eval_utils.py | 303 | |
| LOW | benchmark/mmmu/eval_utils.py | 423 | |
| LOW | benchmark/mmmu/eval_utils.py | 528 | |
| LOW | benchmark/mmmu/eval_utils.py | 430 | |
| LOW | benchmark/json_schema/bench_sglang.py | 29 | |
| LOW | benchmark/line_retrieval/bench_sglang.py | 30 | |
| LOW | benchmark/hicache/bench_mix.py | 299 | |
| LOW | benchmark/hicache/nextqa.py | 9 | |
| LOW | benchmark/hicache/bench_warm_cache.py | 97 | |
| LOW | benchmark/hicache/bench_serving.py | 71 | |
| LOW | benchmark/hicache/bench_serving.py | 286 | |
| LOW | benchmark/hicache/data_processing.py | 37 | |
| LOW | benchmark/hicache/data_processing.py | 198 | |
| LOW | benchmark/hicache/data_processing.py | 267 | |
| LOW | benchmark/hicache/data_processing.py | 520 | |
| LOW | benchmark/hicache/bench_multiturn.py | 435 | |
| LOW | benchmark/hicache/bench_multiturn.py | 555 | |
| LOW | benchmark/prefill_only/util.py | 106 | |
| LOW | benchmark/prefill_only/util.py | 284 | |
| LOW | benchmark/reasoning_benchmark/answer_extraction.py | 8 | |
| LOW | benchmark/reasoning_benchmark/answer_extraction.py | 182 | |
| LOW | benchmark/reasoning_benchmark/answer_extraction.py | 214 | |
| LOW | benchmark/reasoning_benchmark/eval_utils.py | 59 | |
| LOW | benchmark/asr/bench_sglang.py | 75 | |
| LOW | benchmark/json_jump_forward/bench_other.py | 132 | |
| LOW | benchmark/lora/lora_bench.py | 48 | |
| LOW | benchmark/bench_linear_attention/bench_gdn_decode.py | 359 | |
| LOW | benchmark/bench_linear_attention/bench_gdn_prefill.py | 520 | |
| LOW | experimental/sgl-router/tests/e2e/infra/gateway.py | 237 | |
| LOW | experimental/sgl-router/tests/e2e/infra/gateway.py | 378 | |
| LOW | experimental/sgl-router/tests/e2e/infra/model_pool.py | 106 | |
| LOW | …outer/tests/e2e/k8s_integration/test_reconciliation.py | 85 | |
| LOW | …l-router/tests/e2e/chat_completions/test_validation.py | 62 | |
| LOW | test/srt/cpu/test_decode.py | 264 | |
| LOW | test/srt/cpu/test_topk.py | 119 | |
| LOW | test/manual/test_quick_allreduce.py | 106 | |
| LOW | test/manual/test_quick_allreduce.py | 274 | |
| LOW | test/manual/test_triton_attention_rocm_mla.py | 224 | |
| LOW | test/manual/test_logprobs.py | 99 | |
| LOW | test/manual/test_logprobs.py | 302 | |
| LOW | test/manual/test_triton_moe_wna16.py | 16 | |
| LOW | test/manual/test_mscclpp.py | 106 | |
| LOW | test/manual/test_kv_events.py | 28 | |
| LOW | test/manual/test_kv_events.py | 164 | |
| LOW | test/manual/test_kv_events.py | 292 | |
| 1403 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | …hmark/kernels/quantization/tuning_block_wise_kernel.py | 1 | # Copyright 2025 SGLang Team |
| LOW | benchmark/lora/lora_bench.py | 1 | # Copyright 2023-2024 SGLang Team |
| LOW | …ental/sgl-router/tests/proxy/pd_bootstrap_injection.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …perimental/sgl-router/tests/proxy/graceful_shutdown.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …perimental/sgl-router/tests/proxy/pd_pool_isolation.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | experimental/sgl-router/tests/proxy/chat_routing.rs | 701 | .await; |
| LOW | experimental/sgl-router/tests/proxy/chat_routing.rs | 1201 | // Wait long enough for the SSE pump to notice the receiver-drop and |
| LOW | experimental/sgl-router/tests/proxy/common/streaming.rs | 21 | use http_body_util::BodyExt; |
| LOW | …imental/sgl-router/tests/component/tokenizer/parity.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …sgl-router/tests/component/policies/cache_aware_zmq.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …/tests/component/policies/kv_events_two_subscribers.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …/tests/component/policies/kv_events_two_subscribers.rs | 141 | "router_b shutdown after router_a drained took {elapsed:?}; \ |
| LOW | …/tests/component/policies/kv_events_tree_concurrent.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …uter/tests/component/policies/kv_events_hash_parity.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | …sgl-router/tests/component/workers/concurrent_state.rs | 81 | let snapshot = r.workers_for(&model); |
| LOW | experimental/sgl-router/tests/e2e/conftest.py | 41 | # Mirrors SMG's e2e_test/conftest.py sys.path setup. |
| LOW | experimental/sgl-router/tests/e2e/requirements.txt | 1 | httpx==0.27.2 |
| LOW | experimental/sgl-router/benches/tree_lookup.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | experimental/sgl-router/src/proxy/sse.rs | 21 | /// |
| LOW | experimental/sgl-router/src/proxy/sse.rs | 41 | /// The opaque `Box<dyn Send + 'static>` accepts any drop-only payload — most |
| LOW | experimental/sgl-router/src/config/types.rs | 61 | stale_request_timeout_secs: default_stale_request_timeout_secs(), |
| LOW | experimental/sgl-router/src/config/types.rs | 141 | |
| LOW | experimental/sgl-router/src/config/types.rs | 201 | } |
| LOW | experimental/sgl-router/src/config/types.rs | 321 | /// Two operating modes, distinguished by which selector fields are set: |
| LOW | experimental/sgl-router/src/config/types.rs | 341 | /// `WorkerMode` and `bootstrap_port` for each worker are filled in by |
| LOW | experimental/sgl-router/src/config/types.rs | 361 | /// |
| LOW | experimental/sgl-router/src/config/types.rs | 421 | } |
| LOW | experimental/sgl-router/src/config/types.rs | 501 | // watcher (`watcher::Config::default().labels(&label)` |
| LOW | experimental/sgl-router/src/tokenizer/mod.rs | 101 | // tiny BPE fixture is byte-level and lossless for ASCII. |
| LOW | experimental/sgl-router/src/tokenizer/mod.rs | 141 | /// registry's `Arc<Tokenizer>` is `Send + Sync` and that |
| LOW | experimental/sgl-router/src/health/circuit_breaker.rs | 61 | config, |
| LOW | experimental/sgl-router/src/discovery/types.rs | 41 | /// new worker becomes available, and [`DiscoveryEvent::Removed`] when it |
| LOW | experimental/sgl-router/src/discovery/types.rs | 61 | } |
| LOW | experimental/sgl-router/src/discovery/k8s.rs | 81 | /// |
| LOW | experimental/sgl-router/src/discovery/k8s.rs | 141 | }); |
| LOW | experimental/sgl-router/src/discovery/k8s.rs | 221 | Ok(()) |
| LOW | experimental/sgl-router/src/discovery/k8s.rs | 741 | |
| LOW | experimental/sgl-router/src/discovery/static_urls.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | experimental/sgl-router/src/server/error.rs | 21 | /// build error). `source` captures the full anyhow chain for server-side |
| LOW | experimental/sgl-router/src/server/error.rs | 41 | UpstreamStatus { status: StatusCode }, |
| LOW | experimental/sgl-router/src/server/error.rs | 61 | /// independently of full-model outages. |
| LOW | experimental/sgl-router/src/server/error.rs | 81 | #[error("stale request expired for model {model}")] |
| LOW | experimental/sgl-router/src/server/metrics.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | experimental/sgl-router/src/server/metrics.rs | 21 | //! | `sgl_router_requests_total` | Counter | `worker_url`, `model_id`, `mode`, `outcome` | |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 21 | /// affinity for a PD-disaggregated request. The router fans the |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 41 | /// for normal chat traffic (a 200 k-token context tokenized as JSON is well |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 181 | // ends, the client disconnects, or the handler returns an error. In |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 201 | // Snapshot the labels we need for metrics BEFORE moving the worker |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 221 | // |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 341 | ); |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 421 | } |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 441 | (body.len() / CHARS_PER_TOKEN_ESTIMATE).max(1) |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 461 | /// to this address for the KV transfer. |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 501 | let bytes = serde_json::to_vec(&obj).map_err(|e| { |
| LOW | experimental/sgl-router/src/server/routes/chat.rs | 641 | } |
| LOW | experimental/sgl-router/src/policies/registry.rs | 1 | // SPDX-FileCopyrightText: Copyright (c) 2026 The SGLang Authors |
| LOW | experimental/sgl-router/src/policies/registry.rs | 21 | //! |
| LOW | experimental/sgl-router/src/policies/registry.rs | 41 | /// decode peer is considered "too hot" — we fall back to the lowest-load |
| LOW | experimental/sgl-router/src/policies/registry.rs | 61 | /// empty pool as a transient failure, not as "zero work". |
| LOW | experimental/sgl-router/src/policies/registry.rs | 101 | /// Returns `Err(NoHealthyWorkers)` only when the model has zero |
| 735 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | benchmark/dspy/bench_dspy_intro.py | 95 | # Define the predictor. |
| MEDIUM | benchmark/dspy/bench_dspy_intro.py | 107 | # Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged. |
| MEDIUM | benchmark/mmmu/data_utils.py | 157 | # Create a JSON object with the filename as the key and caption as the value |
| MEDIUM | benchmark/hicache/bench_mix.py | 24 | # Create a lock for thread-safe debug log writing |
| MEDIUM | benchmark/prefill_only/bench_score.py | 157 | # Create the request builder function with shared tokenizer |
| MEDIUM | benchmark/prefill_only/util.py | 173 | # Create the full request list by cycling through unique requests |
| MEDIUM | benchmark/multi_turn_chat/long_prompt_multi_turn.py | 29 | # Create a unique cache filename based on the arguments that affect generation |
| MEDIUM | experimental/sgl-router/tests/e2e/infra/gateway.py | 45 | # This file is at `experimental/sgl-router/tests/e2e/infra/gateway.py`, |
| MEDIUM | test/srt/cpu/test_mamba.py | 14 | """This function is intended to align with the l2norm implementation in the FLA library.""" |
| MEDIUM | test/manual/test_weight_validation.py | 75 | # Create a minimal valid safetensors file |
| MEDIUM | test/manual/test_weight_validation.py | 166 | # Create a broken symlink for the index file |
| MEDIUM | test/manual/test_modelopt_fp8kvcache.py | 15 | # Create a ModelOptFp8Config object |
| MEDIUM | test/manual/test_config_integration.py | 27 | # Create a temporary config file |
| MEDIUM | test/manual/test_config_integration.py | 82 | # Create a temporary config file |
| MEDIUM | test/manual/test_config_integration.py | 113 | # Create a temporary config file |
| MEDIUM | test/manual/test_async_dynamic_batch_tokenizer.py | 54 | # Create a proper BatchEncoding-like object that supports dict operations |
| MEDIUM | test/manual/test_async_dynamic_batch_tokenizer.py | 193 | # Create a new async tokenizer with a failing tokenizer |
| MEDIUM | test/manual/attention/test_flashattn_backend.py | 66 | # Create a large enough req_to_token_pool to fit the test usage. |
| MEDIUM | test/manual/attention/test_flashattn_mla_backend.py | 54 | # Create a proper req_to_token_pool with the req_to_token attribute |
| MEDIUM | …ual/lang_frontend/test_separate_reasoning_execution.py | 72 | # Create a mock backend to avoid AttributeError in __del__ |
| MEDIUM | …ual/lang_frontend/test_separate_reasoning_execution.py | 75 | # Create a StreamExecutor with necessary setup |
| MEDIUM | …ual/lang_frontend/test_separate_reasoning_execution.py | 106 | # Create a gen expression and a separate_reasoning expression |
| MEDIUM | test/manual/debug_utils/run_with_retry.py | 37 | # Create a TestFile with a reasonable estimated time |
| MEDIUM | test/manual/eval/test_longbench_v2_eval.py | 73 | # Create a temporary JSON file with sample data |
| MEDIUM | test/registered/quant/test_quant_config_parsing.py | 23 | # Create a raw instance |
| MEDIUM | test/registered/kernels/test_dsa_indexer.py | 78 | # Create a simple page table for testing |
| MEDIUM | test/registered/kernels/test_dsa_indexer.py | 94 | # Create a simple page table for testing with page size 1 |
| MEDIUM | …st/registered/attention/test_wave_attention_kernels.py | 186 | # Define the varying parameter values |
| MEDIUM | …/registered/attention/test_triton_attention_kernels.py | 314 | # Define the varying parameter values |
| MEDIUM | …egistered/attention/test_normal_decode_set_metadata.py | 128 | # Create a simple SWA KV pool for testing |
| MEDIUM | …egistered/attention/test_normal_decode_set_metadata.py | 149 | # Create a minimal mock that inherits from SWAKVPool to pass isinstance check |
| MEDIUM | test/registered/unit/distributed/test_parallel_state.py | 95 | # Create a mock group object |
| MEDIUM | test/registered/unit/distributed/test_parallel_state.py | 195 | # Create a mock group object |
| MEDIUM | …stered/unit/function_call/test_function_call_parser.py | 517 | # Create a concrete implementation of BaseFormatDetector for testing |
| MEDIUM | …tered/unit/mem_cache/test_radix_cache_slru_accuracy.py | 56 | # Create a cache with the memory pools |
| MEDIUM | …ed/unit/mem_cache/test_unified_radix_cache_unittest.py | 1295 | # Create a chain: root -> A -> B -> C (3 levels) |
| MEDIUM | …t/registered/unit/model_loader/test_modelopt_loader.py | 72 | # Create a basic model config with unified quantization flag |
| MEDIUM | …t/registered/unit/model_loader/test_modelopt_loader.py | 116 | # Create a custom load_model method for testing that simulates the real logic |
| MEDIUM | …egistered/unit/entrypoints/openai/test_serving_chat.py | 524 | # Create a mock conversation object that will be returned by generate_chat_conv |
| MEDIUM | …egistered/unit/entrypoints/openai/test_serving_chat.py | 1211 | # Create a mock conversation object |
| MEDIUM | test/registered/cpu/utils.py | 465 | # Create a new tensor with alternating values |
| MEDIUM | test/registered/cpu/utils.py | 488 | # Create a tensor with the E2M1 values |
| MEDIUM | test/registered/cpu/test_mamba.py | 17 | """This function is intended to align with the l2norm implementation in the FLA library.""" |
| MEDIUM | …registered/hicache/test_hicache_storage_3fs_backend.py | 26 | # Create a temporary JSON config file for HF3FS |
| MEDIUM | test/registered/lora/test_chunked_sgmv_backend.py | 189 | # Create a minimal mock ForwardBatch for the test |
| MEDIUM | test/registered/lora/test_chunked_sgmv_backend.py | 221 | # Create a minimal backend instance to access _get_segments_info |
| MEDIUM | test/registered/rl/test_update_weights_from_tensor.py | 136 | # Create a small set of parameters for testing |
| MEDIUM | test/registered/rl/test_update_weights_from_tensor.py | 150 | # Create a flattened bucket |
| MEDIUM | test/registered/rl/test_update_weights_from_tensor.py | 157 | # Create the dict format expected by _update_weights_from_flattened_bucket |
| MEDIUM | test/registered/rl/test_update_weights_from_tensor.py | 167 | # Create a list where each rank contains the same serialized data |
| MEDIUM | python/sglang/bench_serving.py | 1270 | # Create a temporary DatasetRow object for warmup |
| MEDIUM | python/sglang/bench_serving.py | 1286 | # Create the test input once |
| MEDIUM | python/sglang/bench_serving.py | 2309 | # Create a mutually exclusive group for profiling URLs |
| MEDIUM | python/sglang/test/nightly_bench_utils.py | 72 | # Create a combined link or use the first available one |
| MEDIUM | python/sglang/jit_kernel/ngram_embedding.py | 93 | # Create an empty tensor for ignore_tokens |
| MEDIUM | python/sglang/srt/layers/attention/flashmla_backend.py | 524 | # Create a dummy forward_mode for draft step |
| MEDIUM | …/sglang/srt/layers/attention/flashattention_backend.py | 2477 | # Create a modified version for local attention that only processes the last token |
| MEDIUM | …/sglang/srt/layers/attention/flashattention_backend.py | 3262 | # Create a block_table for the local attention blocks |
| MEDIUM | …/sglang/srt/layers/attention/dsa/index_buf_accessor.py | 760 | # Define the token range within the block and the K dimension range handled by the thread. |
| MEDIUM | python/sglang/srt/layers/attention/fla/kda.py | 4 | # This file contains code copied from the flash-linear-attention project. |
| 82 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | …shinfer_allreduce_fusion/benchmark_fused_collective.py | 1153 | # Check if running with torchrun (required for collective operations) |
| LOW | benchmark/json_schema/bench_sglang.py | 97 | # Check if the outputs are valid |
| LOW | benchmark/ceval/bench_sglang.py | 112 | # Print results |
| LOW | benchmark/prefill_only/util.py | 163 | # Check if we have any valid requests |
| LOW | benchmark/boolq/bench_sglang.py | 89 | # Print results |
| LOW | benchmark/gsm8k/bench_sglang.py | 143 | # Print results |
| LOW | benchmark/gsm8k/bench_other.py | 129 | # Print results |
| LOW | benchmark/mmlu/bench_sglang.py | 179 | # Print results |
| LOW | test/manual/test_schedule_policy.py | 76 | # Check if FCFS keeps the original order |
| LOW | test/manual/test_schedule_policy.py | 101 | # Check if priority enabled fcfs ordering is applied. |
| LOW | test/manual/test_schedule_policy.py | 127 | # Check if priority enabled fcfs ordering is applied. |
| LOW | test/manual/test_schedule_policy.py | 149 | # Check if priority enabled fcfs ordering is applied. |
| LOW | test/manual/test_schedule_policy.py | 171 | # Check if priority enabled fcfs ordering is applied. |
| LOW | test/manual/test_schedule_policy.py | 195 | # Check if priority enabled fcfs ordering is applied. |
| LOW | …nual/kv_transfer/test_mooncake_transfer_engine_init.py | 201 | # Check if mooncake should be used |
| LOW | …tered/unit/mem_cache/test_radix_cache_slru_accuracy.py | 108 | # Check if the frequently accessed key-value is still present |
| LOW | …tered/unit/mem_cache/test_radix_cache_slru_accuracy.py | 114 | # Check if the first low-frequency key-value has been evicted |
| LOW | …tered/unit/mem_cache/test_radix_cache_slru_accuracy.py | 126 | # Check if the tensor is empty, which indicates the key was not found (evicted) |
| LOW | …t/registered/unit/model_loader/test_modelopt_export.py | 27 | # Check if modelopt is available |
| LOW | …d/unit/batch_invariant_ops/test_batch_invariant_ops.py | 53 | # Check if results are identical |
| LOW | …d/unit/batch_invariant_ops/test_batch_invariant_ops.py | 193 | # Check if results are identical |
| LOW | …registered/disaggregation/test_disaggregation_basic.py | 206 | # Check if servers are still healthy |
| LOW | …/ascend/interface/test_npu_openai_server_ignore_eos.py | 84 | # Check if ignore_eos resulted in more tokens or exactly max_tokens |
| LOW | …nd/basic_function/offloading/test_npu_offload_modes.py | 70 | # Check if server is running (basic functionality test) |
| LOW | …nai_server/validation/test_openai_server_ignore_eos.py | 75 | # Check if ignore_eos resulted in more tokens or exactly max_tokens |
| LOW | test/registered/observability/test_tracing.py | 77 | # Check if it's a RequestStageConfig (has stage_name and level attributes) |
| LOW | …stered/amd/disaggregation/test_disaggregation_basic.py | 322 | # Check if servers are still healthy |
| LOW | …t/registered/amd/accuracy/mi30x/test_gsm8k_eval_amd.py | 286 | # Print results |
| LOW | …gistered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py | 255 | # Print results |
| LOW | …amd/accuracy/mi35x/test_qwen3_coder_next_eval_mi35x.py | 218 | # Check if model exists |
| LOW | …racy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py | 197 | # Check if model exists |
| LOW | …md/accuracy/mi35x/test_deepseek_r1_mxfp4_eval_mi35x.py | 194 | # Check if model exists |
| LOW | …y/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py | 196 | # Check if model exists |
| LOW | …/registered/rl/test_update_weights_from_distributed.py | 642 | # Check if the weights of lm_head are tied with embed_tokens. |
| LOW | python/sglang/bench_serving.py | 1309 | # Check if at least one warmup request succeeded |
| LOW | python/sglang/utils.py | 414 | # Check if the cache file already exists |
| LOW | python/sglang/bench_one_batch.py | 662 | # Print output texts |
| LOW | python/sglang/benchmark/utils.py | 106 | # Check if the cache file already exists |
| LOW | python/sglang/test/few_shot_gsm8k_engine.py | 135 | # Print results |
| LOW | python/sglang/test/bench_one_batch_server_internal.py | 732 | # Print results |
| LOW | python/sglang/test/send_one.py | 230 | # Print results |
| LOW | python/sglang/test/runners.py | 497 | # Check if the text is empty or only whitespace. |
| LOW | python/sglang/test/runners.py | 792 | # Check if the text is empty or only whitespace. |
| LOW | python/sglang/test/few_shot_gsm8k.py | 138 | # Print results |
| LOW | python/sglang/test/ci/ci_utils.py | 238 | # Check if we should retry |
| LOW | …/sglang/test/server_fixtures/disaggregation_fixture.py | 228 | # Check if port is active |
| LOW | …sglang/jit_kernel/benchmark/bench_custom_all_reduce.py | 374 | # Print results on rank 0. |
| LOW | python/sglang/jit_kernel/tests/test_pos_enc.py | 480 | # Print results |
| LOW | python/sglang/srt/dllm/mixin/scheduler.py | 245 | # Check if batch is full |
| LOW | python/sglang/srt/disaggregation/decode.py | 1548 | # Check if request was aborted due to corruption |
| LOW | python/sglang/srt/layers/sampler.py | 392 | # Check if any requests actually need logprobs computation |
| LOW | python/sglang/srt/layers/attention/xpu_backend.py | 462 | # Check if we should use local attention |
| LOW | python/sglang/srt/layers/attention/triton_backend.py | 185 | # Set static_kv_splits to False to use deterministic logic instead |
| LOW | …thon/sglang/srt/layers/attention/trtllm_mla_backend.py | 749 | # Check if we're in CUDA graph mode (buffers are pre-allocated) |
| LOW | python/sglang/srt/layers/attention/dsa_backend.py | 579 | # Check if MHA FP8 dequantization is needed |
| LOW | python/sglang/srt/layers/attention/dsa_backend.py | 2249 | # Check if sequence meets criteria for MHA_ONE_SHOT |
| LOW | …/sglang/srt/layers/attention/flashattention_backend.py | 727 | # Check if we should use local attention |
| LOW | …/sglang/srt/layers/attention/flashattention_backend.py | 2981 | # Check if we should use the specialized fast path for page_size=1, no SWA |
| LOW | …thon/sglang/srt/layers/attention/flashinfer_backend.py | 1140 | # Check if this specific wrapper's begin_forward has been replaced with fast_decode_plan |
| LOW | …ng/srt/layers/attention/triton_ops/extend_attention.py | 864 | # Check if we can skip this tile |
| 206 more matches not shown… | |||
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | benchmark/bench_linear_attention/bench_gdn_decode.py | 104 | # Step 1: split (same as forward_decode) |
| LOW | benchmark/bench_linear_attention/bench_gdn_decode.py | 107 | # Step 2: view + reshape (same as forward_decode) |
| LOW | benchmark/bench_linear_attention/bench_gdn_decode.py | 112 | # Step 3: fused gating + recurrent update |
| LOW | …mark/bench_linear_attention/bench_fused_gate_cumsum.py | 79 | # Step 1: gate activation using torch ops |
| LOW | …mark/bench_linear_attention/bench_fused_gate_cumsum.py | 87 | # Step 2: chunk-local cumsum |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 38 | # Step 1: Create kind cluster (idempotent) |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 50 | # Step 2: Build Docker images (unless SKIP_DOCKER_BUILD=1) |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 75 | # Step 3: Load images into kind |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 82 | # Step 4: Apply namespace and RBAC |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 89 | # Step 5: Deploy 3 fake-worker replicas behind a Service |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 144 | # Step 6: Create sgl-router ConfigMap with k8s discovery pointing at the |
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 175 | # Step 7: Deploy sgl-router |
| LOW | test/srt/cpu/utils.py | 72 | # Step 2: compute per-block max abs values → scale |
| LOW | test/manual/test_tokenizer_manager.py | 336 | # Step 1: Detect format |
| LOW | test/manual/test_tokenizer_manager.py | 342 | # Step 2: Prepare input |
| LOW | test/manual/test_tokenizer_manager.py | 348 | # Step 3: Extract results (simulated tokenizer output) |
| LOW | test/manual/test_tokenizer_manager.py | 367 | # Step 1: Detect format |
| LOW | test/manual/test_tokenizer_manager.py | 373 | # Step 2: Prepare input |
| LOW | test/manual/test_tokenizer_manager.py | 379 | # Step 3: Extract results (simulated tokenizer output for cross-encoder) |
| LOW | test/manual/test_tokenizer_manager.py | 396 | # Step 1: Detect format |
| LOW | test/manual/test_tokenizer_manager.py | 402 | # Step 2: Prepare input |
| LOW | test/manual/test_tokenizer_manager.py | 408 | # Step 3: Extract results (simulated tokenizer output) |
| LOW | test/manual/dsv4/test_fused_compress_attn_hip.py | 312 | # Step 1: write current tokens to state (same for both paths) |
| LOW | test/registered/unit/managers/test_hisparse_unit.py | 446 | # Step 1: load the first TOP_K positions from host (no newest token — |
| LOW | test/registered/unit/managers/test_hisparse_unit.py | 455 | # Step 2: half overlap (hit) + half new (miss). |
| LOW | test/registered/unit/parser/test_reasoning_parser.py | 1147 | # Step 1: Send partial end tag when not in reasoning mode |
| LOW | test/registered/unit/parser/test_reasoning_parser.py | 1153 | # Step 2: Send normal text that doesn't complete the end tag |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 128 | # Step 1: inc_lock_ref (pop_preallocated locks the matched node) |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 144 | # Step 2: cache_unfinished_req (dec old lock, inc new lock) |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 147 | # Step 3: cache_finished_req with is_insert=True (dec lock) |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 176 | # Step 1: inc_lock_ref on root (simulates get_new_prebuilt_batch) |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 193 | # Step 2: cache_unfinished_req (dec root=no-op, inc new leaf) |
| LOW | …egistered/unit/mem_cache/test_decode_radix_lock_ref.py | 196 | # Step 3: cache_finished_req (dec leaf) |
| LOW | test/registered/unit/mem_cache/test_mamba_unittest.py | 684 | # Step 1: Insert [1,2,3] to create first node |
| LOW | test/registered/unit/mem_cache/test_mamba_unittest.py | 696 | # Step 2: Insert [1,2,3,4,5,6,7] with prev_prefix_len=0 (free all matched) |
| LOW | test/registered/unit/mem_cache/test_mamba_unittest.py | 713 | # Step 3: Insert [1,2,3,4,5,6,7,8] with prev_prefix_len=2 |
| LOW | test/registered/unit/mem_cache/test_mamba_unittest.py | 731 | # Step 4: Insert [1,2,3,4,5,6,7,8,9] with prev_prefix_len=8 (covers all matched) |
| LOW | …ed/unit/mem_cache/test_unified_radix_cache_unittest.py | 637 | # Step 1: insert 1 page |
| LOW | …ed/unit/mem_cache/test_unified_radix_cache_unittest.py | 641 | # Step 2: insert 2 pages with prev_prefix_len=0 → frees overlap of 1 page |
| LOW | …ed/unit/mem_cache/test_unified_radix_cache_unittest.py | 659 | # Step 3: insert 3 pages with prev_prefix_len=len(seq_2p) → nothing freed |
| LOW | test/registered/cpu/utils.py | 72 | # Step 2: compute per-block max abs values → scale |
| LOW | test/registered/function_call/test_kimik2_detector.py | 549 | # Step 1: reasoning parser |
| LOW | test/registered/function_call/test_kimik2_detector.py | 553 | # Step 2: feed normal_text into tool call parser (like serving_chat.py does) |
| LOW | test/registered/function_call/test_kimik2_detector.py | 585 | # Step 1: reasoning parser |
| LOW | test/registered/function_call/test_kimik2_detector.py | 590 | # Step 2: tool call parser on normal_text |
| LOW | test/registered/debug_utils/test_dumper.py | 4090 | # Step 1: graft input. target sends its real q to baseline along |
| LOW | test/registered/debug_utils/test_dumper.py | 4099 | # Step 2: target runs the (suspected buggy) attention kernel — |
| LOW | test/registered/debug_utils/test_dumper.py | 4103 | # Step 3: graft output. baseline sends its (good) attn_out to |
| LOW | test/registered/debug_utils/test_dumper.py | 4064 | # Step 1: graft input. target sends its q to baseline; baseline's |
| LOW | test/registered/debug_utils/test_dumper.py | 4075 | # Step 2: baseline runs the known-good attention kernel. |
| LOW | test/registered/debug_utils/test_dumper.py | 4078 | # Step 3: graft output. baseline sends attn_out to target with a |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 2206 | # Step 0: prefill with 2 sequences (3+2 tokens) |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 2215 | # Step 1: decode (1 token per sequence) |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 2267 | # Step 0: prefill — seq A (3 tokens) + seq B (2 tokens) |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4720 | # Step 1: Create 4 target ranks where moe_tp is absent from ranks 2-3. |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4723 | # Step 2: _is_jointly_determined is called with parent_axes={tp, moe_tp} |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4725 | # Step 3: edp remains undeclared → ValueError emitted as error record. |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4822 | # Step 1: 4 target ranks with TP=2, CP=2 (replicated), EDP=4. |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4824 | # Step 2: _is_jointly_determined is called with parent_axes={tp, cp}, child=edp. |
| LOW | …t/registered/debug_utils/comparator/test_entrypoint.py | 4826 | # Step 3: CP replicated picks one rank per tp group → TP concat → correct shape. |
| 179 more matches not shown… | | | |
| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | python/sglang/test/ascend/test_ascend_utils.py | 467 | Start the service and obtain the inference results. Parameters: model: Model name num_prompts: Tota |
| HIGH | python/sglang/srt/server_args_config_parser.py | 52 | Merge configuration file arguments with command-line arguments. Configuration arguments are inserted a |
| HIGH | python/sglang/srt/server_args_config_parser.py | 101 | Parse YAML configuration file and convert to argument list. Args: file_path: Path to the Y |
| HIGH | python/sglang/srt/layers/model_parallel.py | 29 | Locally shards a full tensor based on indicated sharding arrangement, and returns a DTensor containing the loca |
| HIGH | …thon/sglang/srt/layers/attention/flashinfer_backend.py | 335 | Process multi-item scoring tensors for FlashInfer attention. This method handles sequences containing multiple |
| HIGH | python/sglang/srt/layers/attention/fla/chunk.py | 146 | Args: q (torch.Tensor): queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K |
| HIGH | …hon/sglang/srt/layers/attention/fla/fused_recurrent.py | 459 | Args: q (torch.Tensor): queries of shape `[B, T, H, K]`. k (torch.Tensor): |
| HIGH | python/sglang/srt/layers/quantization/fp8_kernel.py | 1638 | Quantize input tensor to FP8 (8-bit floating point) format. Args: input (torch.Tensor): Input tensor to be quantiz |
| HIGH | python/sglang/srt/layers/moe/cutlass_moe.py | 55 | Performs Fused MoE computation using CUTLASS-like kernels with FP8 weights and activations. This function implement |
| HIGH | python/sglang/srt/function_call/utils.py | 223 | Get consolidated $defs from all tools, validating for conflicts. Args: tools: List of tools to process |
| HIGH | python/sglang/srt/utils/network.py | 119 | Bind a TCP socket on the first available address family (IPv4/IPv6). Iterates over address families returned by _ge |
| HIGH | python/sglang/srt/utils/network.py | 309 | Automatically detect the local IP address using multiple fallback strategies. This function attempts to obtain |
| HIGH | python/sglang/srt/utils/nvtx_pytorch_hooks.py | 46 | Descends iterators that contains Tensors and prints the Tensor Recursive function that descends iterator type a |
| HIGH | python/sglang/srt/utils/nvtx_pytorch_hooks.py | 74 | Extract the static parameters from LLM and VLM relevant layer types Args: module_obj(class): Module |
| HIGH | python/sglang/srt/utils/nvtx_pytorch_hooks.py | 198 | Callback function that ends the NVTX marker Records the module name and tensor information Called after |
| HIGH | python/sglang/srt/utils/nvtx_pytorch_hooks.py | 218 | Creates an NVTX marker with the module name in it. This function is called before the module executes |
| HIGH | python/sglang/srt/utils/nvtx_pytorch_hooks.py | 256 | User level function that activates all the hooks The user needs to call this method from the network source cod |
| HIGH | python/sglang/srt/models/minicpmo.py | 134 | In streaming audio generation, determine which `text` positions the TTS model can attend to when generating each ch |
| HIGH | python/sglang/srt/models/minicpmo.py | 613 | Merge `input_ids` and `lm_spk_emb_last_hidden_states` to `inputs_embeds`. Args: input_ids (torch.Te |
| HIGH | python/sglang/srt/multimodal/processors/qwen_vl.py | 133 | calculate the number of frames for video used for model inputs. Args: ele (dict): a dict contains the confi |
| HIGH | python/sglang/srt/multimodal/processors/ernie45_vl.py | 133 | calculate the number of frames for video used for model inputs. Args: ele (dict): a dict contains the confi |
| HIGH | python/sglang/srt/mem_cache/storage/backend_factory.py | 73 | Create a storage backend instance. Args: backend_name: Name of the backend to create sto |
| HIGH | python/sglang/srt/model_loader/ci_weight_validation.py | 1840 | CI-specific download with validation and automatic retry on corruption. This function handles the download of |
| HIGH | python/sglang/srt/speculative/dflash_utils.py | 164 | Select target layer indices used to build DFlash context features. Args: num_target_layers: Number of trans |
| HIGH | python/sglang/srt/speculative/dflash_utils.py | 427 | Compute DFlash accept lengths and bonus tokens (greedy verify rule). Args: candidates: Token ids proposed b |
| HIGH | …/runtime/managers/memory_managers/layerwise_offload.py | 424 | Update consolidated CPU buffers with new weights. When layerwise offload (--dit-layerwise-offload) is enabled, |
| HIGH | …ang/multimodal_gen/runtime/utils/nvtx_pytorch_hooks.py | 97 | Walk ``model`` and attach forward pre/post hooks to every module. Args: model: Root module to instr |
| HIGH | …ng/multimodal_gen/runtime/models/encoders/qwen2_5vl.py | 554 | Calculate the 3D rope index based on image and video's temporal, height and width in LLM. Explanation: |
| HIGH | …imodal_gen/runtime/pipelines_core/stages/validators.py | 345 | Add a validation check for a field. Args: field_name: Name of the field being checked |
| HIGH | sgl-model-gateway/e2e_test/infra/run_eval.py | 64 | Run an evaluation and return metrics. Args: args: Configuration object with attributes: - base_ |
| HIGH | sgl-model-gateway/e2e_test/infra/model_pool.py | 727 | Get a model instance by model_id, mode, and worker_type. If the model is not running, it will be launched on-de |
| HIGH | sgl-model-gateway/e2e_test/responses/test_basic_crud.py | 24 | Wait for background task to complete. Args: client: OpenAI client response_id: Response ID to poll |
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | docker/k8s-sglang-distributed-sts.yaml | 58 | privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true |
| MEDIUM | …t/manual/layers/attention/dsa/test_act_quant_triton.py | 269 | # Run comprehensive benchmark |
| MEDIUM | test/manual/layers/moe/bench_mxfp4_sm90_kernels.py | 277 | # Timing harness |
| MEDIUM | test/manual/4-gpu-models/test_qwen35_models_archived.py | 17 | # This eval harness applies the chat_template, which is critical for qwen3.5 |
| MEDIUM | test/manual/4-gpu-models/test_qwen35_fp4_triton.py | 5 | # This eval harness applies the chat_template, which is critical for qwen3.5 |
| MEDIUM | test/manual/eval/validate_longbench_v2_standalone.py | 212 | """Generate comprehensive validation report.""" |
| MEDIUM | …tered/attention/unittests/dense/test_flex_attention.py | 65 | # from logical positions, so it's robust to all non-tidy layouts. |
| MEDIUM | test/registered/attention/unittests/dense/test_fa3.py | 51 | # FlashAttention kernels are most stable in this harness with FA-friendly dims. |
| MEDIUM | test/registered/attention/unittests/dense/test_fa4.py | 46 | # FlashAttention kernels are most stable in this harness with FA-friendly dims. |
| MEDIUM | …egistered/attention/unittests/swa/test_torch_native.py | 104 | # arithmetic, so it's robust to all non-tidy layouts. |
| MEDIUM | test/registered/hicache/test_qwen35_hicache.py | 15 | # This eval harness applies the chat_template, which is critical for qwen3.5 |
| LOW | test/registered/debug_utils/test_dumper.py | 3052 | # code can simply call `from sglang.srt.debug_utils.dumper import dumper` |
| MEDIUM | …attention_unittest/attention_methods/dsv4_attention.py | 1255 | # In runner-harness flows the reference is called BEFORE |
| LOW | python/sglang/srt/disaggregation/nixl/conn.py | 101 | ), # hacky just add it into the message that will be sent |
| MEDIUM | python/sglang/srt/disaggregation/common/conn.py | 929 | # Enable higher PP ranks to be bootstrapped earlier to make PP PD requests bootstrap more robust |
| LOW | …/sglang/srt/layers/attention/flashattention_backend.py | 3196 | # then we can simply use a cdiv for the rest. |
| LOW | …ang/srt/layers/attention/mamba/causal_conv1d_triton.py | 152 | # first chunk and does not have prior-token, so just set to 0 |
| MEDIUM | …hon/sglang/srt/layers/attention/dsa/tilelang_kernel.py | 1102 | # to better utilize FP8 dynamic range, then apply the inverse scale after GEMM. |
| MEDIUM | python/sglang/srt/layers/quantization/fpgemm_fp8.py | 47 | # For GPUs that lack FP8 hardware suspport, we can leverage the Marlin |
| MEDIUM | python/sglang/srt/layers/quantization/fp8.py | 230 | # Keep both "model." and non-"model." variants for robust prefix matching. |
| MEDIUM | python/sglang/srt/layers/quantization/fp8.py | 336 | # For GPUs that lack FP8 hardware support, we can leverage the Marlin |
| MEDIUM | python/sglang/srt/layers/quantization/fp8_utils.py | 622 | # TODO: add more robust shape check here |
| MEDIUM | …hon/sglang/srt/layers/quantization/marlin_utils_fp8.py | 69 | # For GPUs that lack FP8 hardware support, we can leverage the |
| LOW | …srt/layers/quantization/mxfp4_flashinfer_trtllm_moe.py | 451 | # in-place. Otherwise `routed` is already scale-final and we just add |
| LOW | python/sglang/srt/managers/io_struct.py | 760 | lora_id: Optional[str] = None # None means just use the base model |
| LOW | python/sglang/srt/managers/io_struct.py | 1057 | lora_id: Optional[str] = None # None means just use the base model |
| LOW | python/sglang/srt/managers/scheduler_pp_mixin.py | 438 | # otherwise, just pass along previous consensus |
| LOW | python/sglang/srt/managers/scheduler.py | 3661 | # In-place pause: just set the flag and return immediately. |
| LOW | python/sglang/srt/function_call/qwen3_coder_detector.py | 41 | # Base class already initializes _buffer, we just use it directly |
| LOW | python/sglang/srt/parser/harmony_parser.py | 492 | # If no emit, just return the held content |
| LOW | python/sglang/srt/mem_cache/hiradix_cache.py | 1143 | # unknown prefetch stop policy, just return True |
| MEDIUM | …n/sglang/multimodal_gen/test/server/accuracy_config.py | 57 | # of real divergence or unsupported reference loading in the harness. |
| LOW | python/sglang/multimodal_gen/test/server/gpu_cases.py | 60 | # To test different models, simply add more DiffusionCase entries |
| MEDIUM | …on/sglang/multimodal_gen/test/server/accuracy_hooks.py | 26 | # These are harness defaults for synthetic accuracy inputs. |
| MEDIUM | …lang/multimodal_gen/runtime/layers/quantization/fp8.py | 191 | # For GPUs that lack FP8 hardware support, we can leverage the Marlin |
| MEDIUM | …imodal_gen/runtime/pipelines_core/stages/validators.py | 503 | """Get a comprehensive summary of all validation failures.""" |
| MEDIUM | sgl-kernel/csrc/cpu/decode.cpp | 1429 | // use smaller BLOCK_H when batches is small to utilize all cores |
| MEDIUM | …del-gateway/e2e_test/k8s_integration/test_lifecycle.py | 374 | # "total count == 0" keeps the test robust if a parallel/earlier |
| MEDIUM | …way/e2e_test/chat_completions/test_function_calling.py | 455 | # Make the test more robust by checking type and accepting valid responses |
| MEDIUM | …l-gateway/bindings/python/src/sglang_router/mini_lb.py | 440 | # We may utilize `GenerateReqInput`'s logic later |
| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | benchmark/hicache/bench_mix.py | 465 | self.user_generator.push( |
| HIGH | experimental/sgl-router/tests/e2e/infra/gateway.py | 353 | "Build it first: `cd experimental/sgl-router && cargo build --release` " |
| HIGH | test/manual/models/test_qwen3_asr.py | 630 | f"first commit's previous_item_id must be JSON null, got {committed_1!r}", |
| HIGH | test/manual/debug_utils/test_log_parser.py | 16 | """[{"line":"(SGLangEngine pid=35555) [2025-10-31 03:45:20 TP0] Decode batch [51341], #running-req: 317, #to |
| HIGH | …tion_unittest/runner_modes/cuda_graph_decode_runner.py | 417 | # capture, which makes the capture-time output undefined; only |
| HIGH | …t/layers/quantization/nvfp4_gemm_swiglu_nvfp4_quant.py | 2552 | :param c_sf_ptr: Pointer to scale factor tensor for C (can be null) |
| HIGH | …t/layers/quantization/nvfp4_gemm_swiglu_nvfp4_quant.py | 2556 | :param norm_const_ptr: Pointer to normalization constant for SFC generation (can be null) |
| HIGH | python/sglang/srt/function_call/utils.py | 297 | return "string" # If only null, default to string |
| HIGH | python/sglang/srt/function_call/mimo_detector.py | 144 | <parameter=command>pwd && ls</parameter> |
| HIGH | python/sglang/srt/utils/field_validators.py | 66 | raise ValueError(f"must be list or null; got {type(v).__name__}") |
| HIGH | python/sglang/srt/utils/hf_transformers/tokenizer.py | 366 | # When the config omits the key or has null, use the v4 default so that |
| HIGH | python/sglang/srt/entrypoints/http_server.py | 429 | {"error": {"message": "...", "type": "...", "param": null, "code": <status>}} |
| HIGH | …al-refactor-verify/mechanical_refactor_verify_utils.py | 30 | exec_command(f"git add -A && git commit -m {shlex.quote(message)}", cwd=cwd) |
| HIGH | scripts/ci/utils/runner_utilization_report.py | 188 | equal to `created_at` (not null). The previous code required both a |
| Severity | File | Line | Snippet |
|---|---|---|---|
| CRITICAL | …tention_unittest/attention_methods/mamba2_attention.py | 537 | self.mixer.norm.weight.data = self.mixer.norm.weight.data.float() |
| CRITICAL | python/sglang/srt/layers/attention/fla/utils.py | 231 | return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)[ |
| CRITICAL | python/sglang/srt/layers/attention/fla/utils.py | 286 | triton.runtime.driver.active.utils.get_device_properties(i)[ |
| CRITICAL | python/sglang/srt/models/glm4_moe.py | 490 | ) and self.shared_experts.gate_up_proj.quant_method.quant_config.get_name() in { |
| CRITICAL | python/sglang/srt/models/deepseek_v2.py | 734 | ) and self.shared_experts.gate_up_proj.quant_method.quant_config.get_name() in { |
| CRITICAL | …l_gen/runtime/pipelines_core/composed_pipeline_base.py | 124 | task_name = self.server_args.pipeline_config.task_type.name.lower() |
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | …manual/openai_server/function_call/test_tool_choice.py | 845 | # # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools. |
| MEDIUM | …d/ascend/interface/test_npu_openai_function_calling.py | 53 | # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools. |
| MEDIUM | …i_server/function_call/test_openai_function_calling.py | 43 | # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools. |
| MEDIUM | …i_server/function_call/test_openai_function_calling.py | 926 | # # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools. |
| LOW | python/sglang/test/runners.py | 488 | # make sure to disable compile |
| MEDIUM | python/sglang/jit_kernel/flash_attention_v3.py | 92 | # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. |
| LOW | python/sglang/srt/models/hunyuan.py | 784 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | python/sglang/srt/models/apertus.py | 384 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | python/sglang/srt/models/mimo_v2.py | 968 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | python/sglang/srt/models/llama.py | 436 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | python/sglang/srt/models/glm4.py | 398 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | python/sglang/srt/models/qwen2.py | 403 | # make sure to leave KV cache scale factors in a known good (dummy) state |
| LOW | …glang/multimodal_gen/runtime/models/vaes/hunyuanvae.py | 191 | # If you are encountering an error here, make sure to try running encoding/decoding with |
| MEDIUM | sgl-kernel/python/sgl_kernel/flash_attn.py | 24 | # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. |
| MEDIUM | sgl-kernel/tests/test_flash_attention.py | 28 | # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. |
| MEDIUM | …ntend_language/quick_start/local_example_llava_next.py | 55 | # Or you can use the 72B model |
| MEDIUM | …ntend_language/quick_start/local_example_llava_next.py | 62 | # Or you can use API models |
| MEDIUM | …-gateway/examples/wasm/wasm-guest-ratelimit/src/lib.rs | 37 | // This is a simplified example for demonstration purposes |
| Severity | File | Line | Snippet |
|---|---|---|---|
| MEDIUM | python/sglang/srt/entrypoints/openai/serving_base.py | 146 | |
| MEDIUM | python/sglang/srt/entrypoints/openai/serving_base.py | 149 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 706 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 707 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 708 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 711 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 712 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 714 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 716 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 719 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 720 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 722 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 732 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 747 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 757 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 758 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 759 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 787 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 788 | |
| MEDIUM | …ang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 790 | |
| Severity | File | Line | Snippet |
|---|---|---|---|
| HIGH | test/manual/attention/test_trtllm_mla_backend.py | 1238 | # Reshape as requested |
| HIGH | …ang/multimodal_gen/runtime/utils/hf_diffusers_utils.py | 413 | # Special handling for stop token <|eom_id|> generated by llama 3 tool use. |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | …l/sgl-router/tests/scripts/generate_parity_fixtures.py | 41 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " |
| LOW | …l/sgl-router/tests/scripts/generate_parity_fixtures.py | 41 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " |
| LOW | python/sglang/test/kits/ebnf_constrained_kit.py | 62 | self.__class__.ebnf_grammar = 'root ::= "user@example.com"' |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 174 | let claims = create_claims("admin@example.com", vec!["admin"]); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 187 | assert_eq!(validated.subject, "admin@example.com"); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 213 | let claims = create_claims("user@example.com", vec!["user", "viewer"]); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 221 | assert_eq!(validated.subject, "user@example.com"); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 252 | sub: "user@example.com".to_string(), |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 301 | sub: "user@example.com".to_string(), |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 338 | let claims = create_claims("user@example.com", vec!["admin"]); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 590 | let claims = create_claims("user@example.com", vec!["admin"]); |
| LOW | …-model-gateway/tests/security/auth_integration_test.rs | 713 | let claims = create_claims("user@example.com", vec!["admin"]); |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | …rimental/sgl-router/tests/e2e/k8s_integration/setup.sh | 6 | # Usage: |
| LOW | scripts/killall_sglang.sh | 8 | # Usage: |
| LOW | scripts/ci/musa/rename_wheels_musa.sh | 10 | # Usage: |
| LOW | sgl-model-gateway/e2e_test/k8s_integration/setup.sh | 8 | # Usage: |
| LOW | …ngs/golang/examples/oai_server/scripts/analyze_tpot.sh | 6 | # Usage: |
| Severity | File | Line | Snippet |
|---|---|---|---|
| LOW | benchmark/hicache/bench_mix.py | 417 | async def handle_request(self, user_data): |
| LOW | benchmark/hicache/bench_multiturn.py | 388 | async def handle_request(self, item): |
| LOW | python/sglang/srt/debug_utils/dumper.py | 1329 | def handle_request(self, *, method: str, body: dict[str, Any]) -> list[dict]: |
| LOW | python/sglang/srt/entrypoints/openai/serving_base.py | 73 | async def handle_request( |