Repository Analysis

datalab-to/marker

Convert PDF to markdown + JSON quickly with high accuracy

4.2 · Likely human-written · View on GitHub
4.2
Adjusted Score
4.2
Raw Score
100%
Time Factor
2026-05-05
Last Push
35,556
Stars
Python
Language
60,453
Lines of Code
233
Files
233
Pattern Hits
2026-05-31
Scan Date

Score History

Severity Breakdown

CRITICAL 0 · HIGH 0 · MEDIUM 20 · LOW 213

Pattern Findings

233 matches across 9 categories. Click a row to expand file-level details.

Excessive Try-Catch Wrapping — 62 hits · 68 pts
Severity | File | Line | Snippet
LOWexamples/marker_modal_deployment.py92 except Exception as e:
LOWexamples/marker_modal_deployment.py117 except Exception as e:
LOWexamples/marker_modal_deployment.py286 except Exception as e:
LOWexamples/marker_modal_deployment.py343 except Exception as e:
MEDIUMexamples/marker_modal_deployment.py344 print(f"Error getting web URL: {e}")
LOWexamples/marker_modal_deployment.py363 except Exception as e:
MEDIUMexamples/marker_modal_deployment.py394 print(f"Error: {response.text}")
LOWexamples/marker_modal_deployment.py396 except Exception as e:
LOWbenchmarks/table/inference.py156 except Exception as e:
LOWbenchmarks/overall/elo.py119 except Exception as e:
MEDIUMbenchmarks/overall/elo.py120 print(f"Error: {e}")
LOWbenchmarks/overall/elo.py159 except Exception as e:
MEDIUMbenchmarks/overall/elo.py160 print(f"Error: {e}")
LOWbenchmarks/overall/overall.py60 except Exception as e:
LOWbenchmarks/overall/overall.py72 except Exception as e:
LOWbenchmarks/overall/methods/olmocr.py66 except Exception:
LOWbenchmarks/overall/methods/olmocr.py69 except Exception:
LOWbenchmarks/overall/scorers/clean.py87 except Exception as e:
MEDIUMbenchmarks/overall/scorers/clean.py78def standardize_math(self, match):
MEDIUMbenchmarks/overall/download/base.py48 print(f"Error with sample {idx}: {e}")
LOWbenchmarks/overall/download/base.py50 except Exception as e:
MEDIUMbenchmarks/overall/download/base.py51 print(f"Error with sample {idx}: {e}")
LOWbenchmarks/overall/display/dataset.py34 except Exception as e:
LOWmarker/builders/ocr.py369 except Exception:
LOWmarker/config/parser.py148 except Exception as e:
LOWmarker/config/parser.py159 except Exception as e:
LOWmarker/providers/spreadsheet.py40 except Exception as e:
LOWmarker/providers/registry.py79 except Exception:
LOWmarker/providers/html.py16 except Exception as e:
LOWmarker/providers/pdf.py449 except Exception:
LOWmarker/providers/powerpoint.py53 except Exception as e:
LOWmarker/providers/powerpoint.py223 except Exception as e:
LOWmarker/providers/document.py61 except Exception as e:
LOWmarker/providers/document.py100 except Exception as e:
MEDIUMmarker/providers/document.py89def convert_image(match):
LOWmarker/providers/epub.py56 except Exception as e:
LOWmarker/utils/gpu.py122 except Exception as e:
MEDIUMmarker/utils/gpu.py98def stop_mps_server(self) -> None:
LOWmarker/processors/block_relabel.py55 except Exception as e:
LOWmarker/processors/llm/llm_meta.py60 except Exception as e:
LOWmarker/processors/llm/__init__.py127 except Exception as e:
LOWmarker/processors/llm/__init__.py143 except Exception as e:
LOWmarker/processors/llm/__init__.py189 except Exception as e:
LOWmarker/processors/llm/llm_page_correction.py196 except Exception as e:
LOWmarker/processors/llm/llm_page_correction.py217 except Exception as e:
LOWmarker/processors/llm/llm_page_correction.py264 except Exception as e:
LOWmarker/scripts/server.py106 except Exception as e:
LOWmarker/scripts/convert.py56 except Exception:
LOWmarker/scripts/convert.py96 except Exception as e:
LOWmarker/scripts/extraction_app.py166 except Exception as e:
LOWmarker/scripts/extraction_app.py228 except Exception as e:
LOWmarker/scripts/file_to_s3.py38 except Exception as e:
MEDIUMmarker/scripts/file_to_s3.py39 print(f"Error uploading {filepath}: {str(e)}")
LOWmarker/scripts/common.py156 except Exception as e:
MEDIUMmarker/scripts/common.py157 print(f"Error parsing schema: {e}")
LOWmarker/services/claude.py52 except Exception:
LOWmarker/services/claude.py58 except Exception:
LOWmarker/services/claude.py130 except Exception as e:
LOWmarker/services/gemini.py126 except Exception as e:
LOWmarker/services/openai.py123 except Exception as e:
2 more matches not shown…
Unused Imports — 59 hits · 58 pts
Severity | File | Line | Snippet
LOWbenchmarks/table/inference.py14
LOWbenchmarks/table/inference.py15
LOWbenchmarks/table/table.py6
LOWbenchmarks/overall/elo.py3
LOWbenchmarks/overall/elo.py5
LOWbenchmarks/overall/elo.py6
LOWbenchmarks/overall/elo.py6
LOWbenchmarks/overall/elo.py6
LOWbenchmarks/overall/elo.py18
LOWbenchmarks/overall/methods/__init__.py2
LOWbenchmarks/overall/methods/__init__.py4
LOWbenchmarks/overall/methods/olmocr.py7
LOWbenchmarks/overall/scorers/llm.py13
LOWbenchmarks/overall/scorers/schema.py1
LOWbenchmarks/overall/display/table.py2
LOWmarker/builders/line.py6
LOWmarker/schema/polygon.py1
LOWmarker/schema/polygon.py5
LOWmarker/schema/document.py1
LOWmarker/schema/blocks/__init__.py1
LOWmarker/schema/blocks/__init__.py3
LOWmarker/schema/blocks/__init__.py3
LOWmarker/schema/blocks/__init__.py3
LOWmarker/schema/blocks/__init__.py4
LOWmarker/schema/blocks/__init__.py5
LOWmarker/schema/blocks/__init__.py6
LOWmarker/schema/blocks/__init__.py7
LOWmarker/schema/blocks/__init__.py8
LOWmarker/schema/blocks/__init__.py9
LOWmarker/schema/blocks/__init__.py10
LOWmarker/schema/blocks/__init__.py11
LOWmarker/schema/blocks/__init__.py12
LOWmarker/schema/blocks/__init__.py13
LOWmarker/schema/blocks/__init__.py14
LOWmarker/schema/blocks/__init__.py15
LOWmarker/schema/blocks/__init__.py16
LOWmarker/schema/blocks/__init__.py17
LOWmarker/schema/blocks/__init__.py18
LOWmarker/schema/blocks/__init__.py19
LOWmarker/schema/blocks/__init__.py20
LOWmarker/schema/blocks/__init__.py21
LOWmarker/schema/blocks/__init__.py22
LOWmarker/schema/blocks/form.py1
LOWmarker/schema/blocks/base.py1
LOWmarker/schema/groups/__init__.py1
LOWmarker/schema/groups/__init__.py2
LOWmarker/schema/groups/__init__.py3
LOWmarker/schema/groups/__init__.py4
LOWmarker/schema/groups/__init__.py5
LOWmarker/schema/groups/__init__.py6
LOWmarker/schema/text/__init__.py1
LOWmarker/schema/text/__init__.py2
LOWmarker/processors/util.py1
LOWmarker/processors/order.py1
LOWmarker/scripts/file_to_s3.py1
LOWmarker/scripts/file_to_s3.py2
LOWmarker/scripts/file_to_s3.py3
LOWmarker/scripts/file_to_s3.py7
LOWmarker/scripts/chunk_convert.py4
Deep Nesting — 49 hits · 49 pts
Severity | File | Line | Snippet
LOWtests/conftest.py130
LOWbenchmarks/table/inference.py45
LOWbenchmarks/overall/elo.py180
LOWbenchmarks/overall/overall.py24
LOWbenchmarks/overall/scorers/heuristic.py50
LOWbenchmarks/overall/display/dataset.py11
LOWbenchmarks/overall/display/table.py17
LOWmarker/output.py55
LOWmarker/renderers/ocr_json.py55
LOWmarker/renderers/html.py50
LOWmarker/renderers/html.py81
LOWmarker/renderers/markdown.py29
LOWmarker/renderers/markdown.py120
LOWmarker/builders/layout.py94
LOWmarker/builders/ocr.py165
LOWmarker/builders/ocr.py252
LOWmarker/config/printer.py9
LOWmarker/config/crawler.py34
LOWmarker/config/crawler.py99
LOWmarker/providers/spreadsheet.py81
LOWmarker/providers/pdf.py127
LOWmarker/providers/pdf.py202
LOWmarker/providers/powerpoint.py64
LOWmarker/providers/powerpoint.py140
LOWmarker/converters/__init__.py24
LOWmarker/converters/pdf.py154
LOWmarker/schema/polygon.py78
LOWmarker/schema/polygon.py117
LOWmarker/schema/polygon.py193
LOWmarker/schema/groups/page.py262
LOWmarker/schema/text/line.py60
LOWmarker/schema/text/span.py82
LOWmarker/processors/list.py57
LOWmarker/processors/ignoretext.py44
LOWmarker/processors/order.py15
LOWmarker/processors/debug.py62
LOWmarker/processors/blockquote.py35
LOWmarker/processors/table.py85
LOWmarker/processors/table.py234
LOWmarker/processors/table.py312
LOWmarker/processors/table.py457
LOWmarker/processors/table.py604
LOWmarker/processors/line_numbers.py40
LOWmarker/processors/line_numbers.py76
LOWmarker/processors/sectionheader.py38
LOWmarker/processors/llm/llm_table_merge.py156
LOWmarker/processors/llm/llm_table.py249
LOWmarker/scripts/common.py96
LOWmarker/services/gemini.py43
Hyper-Verbose Identifiers — 37 hits · 38 pts
Severity | File | Line | Snippet
LOWtests/renderers/test_html_renderer.py14def test_html_renderer_block_ids(pdf_document, config):
LOWtests/renderers/test_markdown_renderer.py18def test_markdown_renderer_auto_ocr(pdf_document):
LOWtests/renderers/test_markdown_renderer.py27def test_markdown_renderer_pagination(pdf_document):
LOWtests/renderers/test_markdown_renderer.py36def test_markdown_renderer_pagination_blank_last_page(pdf_document):
LOWtests/renderers/test_markdown_renderer.py52def test_markdown_renderer_metadata(pdf_document):
LOWtests/renderers/test_markdown_renderer.py59def test_markdown_renderer_images(pdf_document):
LOWtests/renderers/test_markdown_renderer.py68def test_markdown_renderer_tables(pdf_document):
LOWtests/renderers/test_json_renderer.py7def test_markdown_renderer_pagination(pdf_document):
LOWtests/renderers/test_extract_images.py8def test_disable_extract_images(pdf_document):
LOWtests/builders/test_ocr_pipeline.py44def test_ocr_with_inline_pipeline(pdf_document):
LOWtests/builders/test_document_builder.py28def test_document_builder_inline_eq(pdf_document):
LOWtests/providers/test_image_provider.py13def test_image_provider_conversion(pdf_converter, temp_image):
LOWtests/converters/test_extraction_converter.py52def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
LOWtests/converters/test_extraction_converter.py66def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
LOWtests/processors/test_llm_processors.py19def test_llm_form_processor_no_config(pdf_document, llm_service):
LOWtests/processors/test_llm_processors.py30def test_llm_form_processor_no_cells(pdf_document, llm_service):
LOWtests/processors/test_llm_processors.py107def test_llm_caption_processor_disabled(pdf_document):
LOWtests/processors/test_llm_processors.py119def test_llm_caption_processor(pdf_document):
LOWtests/processors/test_llm_processors.py141def test_llm_complex_region_processor(pdf_document):
LOWtests/processors/test_llm_processors.py167def test_multi_llm_processors(pdf_document):
LOWtests/processors/test_table_merge.py11def test_llm_table_processor_nomerge(pdf_document, table_rec_model, recognition_model, detection_model, mocker):
LOWtests/processors/test_document_toc_processor.py7def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model):
LOWtests/processors/test_ignoretext.py10def test_ignoretext_processor(pdf_document):
LOWexamples/marker_modal_deployment.py33def setup_models_with_cache_check(logger, commit_volume=False):
LOWmarker/renderers/__init__.py117 def generate_document_metadata(self, document: Document, document_output):
LOWmarker/renderers/chunk.py36def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[str]) -> str:
LOWmarker/renderers/json.py29def reformat_section_hierarchy(section_hierarchy):
LOWmarker/builders/ocr.py96 def get_recognition_batch_size(self):
LOWmarker/builders/ocr.py105 def select_ocr_blocks_by_mode(
LOWmarker/builders/ocr.py120 def get_ocr_images_polygons_ids(
LOWmarker/config/crawler.py64 def _gather_super_annotations(cls: Type) -> Dict[str, Type]:
LOWmarker/utils/batch.py4def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int):
LOWmarker/schema/groups/page.py139 def compute_line_block_intersections(
LOWmarker/schema/groups/page.py163 def compute_max_structure_block_intersection_pct(self):
LOWmarker/processors/table.py705 def get_recognition_batch_size(self):
LOWmarker/processors/line_numbers.py61 def ignore_line_number_blocks(self, document: Document):
LOWmarker/scripts/common.py96def extract_root_pydantic_class(schema_code: str) -> Optional[str]:
Redundant / Tautological Comments — 14 hits · 17 pts
Severity | File | Line | Snippet
LOWexamples/marker_modal_deployment.py41 # Check if models exist in cache
LOWexamples/marker_modal_deployment.py207 # Read file content
LOWmarker/providers/spreadsheet.py95 # Check if this cell is the start of a merged range
LOWmarker/providers/registry.py76 # Check if there are any HTML tags
LOWmarker/schema/groups/page.py60 # Check if RGB, convert if needed
LOWmarker/processors/footnote.py25 # Check if it is top-level
LOWmarker/processors/llm/llm_mathblock.py126 # Check if the ratio of math blocks to additional blocks is high enough
LOWmarker/processors/llm/llm_table_merge.py297 # Check if the number of rows is the same
LOWmarker/processors/llm/llm_table_merge.py302 # Check if the number of columns is the same
LOWmarker/scripts/chunk_convert.sh5# Check if NUM_DEVICES is set
LOWmarker/scripts/extraction_app.py83 # Check if this is a new file
LOWmarker/scripts/extraction_app.py196# Check if schema is provided before running
LOWmarker/scripts/common.py107 # Check if this class inherits from BaseModel
LOWmarker/scripts/common.py129 # Check if this field references another class
Self-Referential Comments — 6 hits · 16 pts
Severity | File | Line | Snippet
MEDIUMexamples/marker_modal_deployment.py9# Define the Modal app
MEDIUMexamples/marker_modal_deployment.py14# Define the container image with all dependencies
MEDIUMexamples/marker_modal_deployment.py30# Create a persistent volume for model caching
MEDIUMmarker/builders/structure.py68 # Create a merged block
MEDIUMmarker/scripts/extraction_app.py80 # Create a unique identifier for the current file
MEDIUMmarker/scripts/extraction_app.py109 # Initialize schema variable
AI Slop Vocabulary — 2 hits · 4 pts
Severity | File | Line | Snippet
MEDIUMdata/examples/json/multicolcnn.json1164 "html": "<p block-type=\"Text\">We propose the use of dilated convolutions as an attractive alternative to the
MEDIUMdata/examples/json/multicolcnn.json1865 "html": "<p block-type=\"Text\">Furthermore, we performed a set of experiments in which we varied the number o
Slop Phrases — 1 hit · 3 pts
Severity | File | Line | Snippet
MEDIUMexamples/marker_modal_deployment.py304# that you can use to test your deployment. It'll store the
Over-Commented Block — 3 hits · 3 pts
Severity | File | Line | Snippet
LOW…/examples/markdown/switch_transformers/switch_trans.md561import mesh tensorflow as mtf
LOW…/examples/markdown/switch_transformers/switch_trans.md661# d model = model hidden size (scalar).
LOW…/examples/markdown/switch_transformers/switch_trans.md681# probability.