Configuration Reference

This is a complete reference for all TOML configuration options.

Top-Level Settings

| Key | Type | Required | Default | Description |
|-----|------|----------|---------|-------------|
| name | string | Yes | - | Site identifier |
| start_urls | list or table | Yes | - | Starting URLs |
| extends | string | No | - | Base config to inherit from |
| parsers | list | No | [] | Custom parser modules to load |
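
For example (the URL is illustrative; extends and parsers reuse the values from the complete example below):

name = "example-site"
start_urls = ["https://example.com/listings"]
extends = "base.toml"        # optional: inherit shared settings
parsers = ["custom_parsers"] # optional: extra parser modules to load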

start_urls

# List of URLs
start_urls = ["https://example.com/page1", "https://example.com/page2"]

# Load from file
start_urls = { file = "urls.txt" }

start_urls.file is resolved relative to the config file directory.

[extract] Section

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| type | "html" or "json" | "html" | Extraction type |
| items_from | "item", "pagination", or "all" | "item" | Which URL types to save items from |
| base_url | string | null | Base URL for relative links (HTML only) |
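
For example, HTML extraction that saves items from every URL type and resolves relative links (base URL illustrative):

[extract]
type = "html"
items_from = "all"
base_url = "https://example.com"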

[extract.items] Section

HTML Items Config

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| selector | string | "" | CSS selector for item containers |
| id | string | null | Field name or path for deduplication |
| fields | table | {} | Field extraction definitions |
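
For example, one item per listing card, deduplicated on an extracted field (selector and field name illustrative):

[extract.items]
selector = ".listing-card"   # each match becomes one item
id = "property_id"           # deduplicate on this extracted field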

JSON Items Config

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| path | string | null | Dot-notation path to the items array |
| id | string | null | Path to the unique identifier |
| fields | table | null | Field definitions (null = export the full item) |
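
For example, exporting full items from a nested array (paths illustrative):

[extract.items]
path = "data.results"   # dot-notation path to the items array
id = "id"               # unique identifier within each item
# fields omitted (null): each item is exported in full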

[extract.items.fields] Section

HTML Field Config

[extract.items.fields]
# Simple selector (text content)
title = "h1.title"

# Full config (sub-table form; "field" is a placeholder name, values show types)
[extract.items.fields.field]
selector = "string"      # CSS selector (required unless using keys/values)
attribute = "string"     # Attribute to extract (null = text content)
parser = "string"        # Parser function name
required = false         # Fail the item if this field is missing
multiple = false         # Extract all matches as a list
keys = "string"          # CSS selector for keys (key-value extraction)
values = "string"        # CSS selector for values (key-value extraction)
units = "string"         # CSS selector for units (appended to values)

JSON Field Config

[extract.items.fields]
# Simple path
title = "data.title"

# Full config (sub-table form; "field" is a placeholder name, values show types)
[extract.items.fields.field]
path = "string"          # Dot-notation path (required)
parser = "string"        # Parser function name
required = false         # Fail the item if this field is missing

[extract.links] Section

HTML Links Config

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| pagination | list | [] | CSS selectors for pagination links |
| items | list | [] | CSS selectors for item detail links |
| attribute | string | "href" | Attribute containing the URL |
| base_url | string | null | Base URL for relative links |
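
For example, following next-page links and listing cards (selectors mirror the complete example below):

[extract.links]
pagination = [".pagination a.next"]
items = [".listing-card a"]
attribute = "href"
base_url = "https://example.com"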

JSON Links Config

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| pagination | list | [] | Dot-notation paths to pagination URLs |
| items_path | string | null | Path to the items array for URL construction |
| items_id | string | null | ID field within each item |
| items_url | string | null | URL template with an {id} placeholder |
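
For example, building item URLs from IDs found in the response (paths and template illustrative):

[extract.links]
pagination = ["meta.next_page"]
items_path = "data.results"
items_id = "id"
items_url = "https://example.com/items/{id}"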

[extract.derived] Section

Derived fields extract values from already-extracted nested data.

[extract.derived]
# Simple path
property_id = "details.Property ID"

# Full config (sub-table form; "field" is a placeholder name, values show types)
[extract.derived.field]
path = "string"          # Dot-notation path (required)
parser = "string"        # Parser function name
remove_source = true     # Remove the key from the source dict

[policy] Section

Retry Settings

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| max_retries | int >= 0 | 3 | Maximum retry attempts per URL |
| retry_delay | float > 0 | 1.0 | Initial retry delay (seconds) |
| backoff_factor | float >= 1 | 2.0 | Delay multiplier after each retry |
| max_retry_delay | float > 0 | 60.0 | Maximum retry delay (seconds) |
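
Assuming the usual exponential-backoff interpretation of these fields (each retry waits backoff_factor times longer than the last, capped at max_retry_delay), a config like this:

[policy]
max_retries = 4
retry_delay = 1.0
backoff_factor = 2.0
max_retry_delay = 5.0
# Successive waits: 1.0 s, 2.0 s, 4.0 s, then capped at 5.0 s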

Concurrency and Pacing

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| concurrency | int >= 1 | 5 | Parallel requests |
| delay | float >= 0 | 0.0 | Delay after each batch (seconds) |
| jitter | float >= 0 | 0.1 | Random delay before each request (seconds) |
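
For example, mirroring the pacing values used in the complete example below:

[policy]
concurrency = 5   # five requests in flight at once
delay = 1.0       # pause one second after each batch
jitter = 0.2      # random delay before each request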

Stopping Conditions

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| max_requests | int >= 1 | null | Stop after this many total requests |
| max_consecutive_failures | int >= 1 | 50 | Stop after N consecutive failures |
| max_error_rate | float 0-1 | 0.5 | Stop if the error rate exceeds this |
| min_requests_for_error_rate | int >= 1 | 20 | Minimum requests before the error-rate check applies |
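
For example, a bounded crawl that stops early under sustained errors (thresholds illustrative):

[policy]
max_requests = 1000              # hard cap on total requests
max_consecutive_failures = 25    # stop after 25 failures in a row
max_error_rate = 0.3             # stop once 30% of requests fail...
min_requests_for_error_rate = 50 # ...but only after at least 50 requests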

Incremental Settings

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| stop_on_empty | bool | true | Stop the current branch on an empty page |
| stop_on_caught_up | bool | false | Stop the whole crawl once caught up |
| caught_up_threshold | int >= 1 | 3 | Consecutive caught-up pages required for a global stop |
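
For example, an incremental re-crawl that halts globally after three consecutive caught-up pages:

[policy]
stop_on_empty = true
stop_on_caught_up = true
caught_up_threshold = 3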

[hooks] Section

Lifecycle hooks for automated recovery and notifications.

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| on_start | string | null | Shell command before the crawl (exit non-zero to abort) |
| on_failure | string | null | Shell command on a consecutive-failures stop (exit 0 to resume) |
| on_complete | string | null | Shell command after the crawl finishes |
| max_hook_retries | int >= 1 | 3 | Maximum times on_failure can fire per crawl |
| hook_timeout | float > 0 | 300.0 | Timeout per hook execution (seconds) |

Commands support template variables: {name}, {failures}, {items}, {requests}.

[hooks]
on_failure = "python scripts/recover.py {name}"
on_complete = "echo Done: {name} {items} items"
max_hook_retries = 5
hook_timeout = 60.0

[storage] Section

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| path | string | "data/{name}" | Output directory (relative to CWD) |
| max_pending_items | int > 0 | 1000 | Flush when this many items are buffered |
| flush_policy | "finalize" or "periodic" | "finalize" | Buffer flush behavior |
| target_max_file_mb | float > 0 | 50.0 | Target maximum size per Parquet part file (MB) |
| compression | "snappy", "zstd", "gzip", or "none" | "snappy" | Parquet compression codec |
| ephemeral_index | bool | false | Delete .index.db on clean close |
| state_db_name | string | ".state.db" | Queue state database filename |
| index_db_name | string | ".index.db" | Storage catalog database filename |
| items_dir_name | string | "items" | Directory name for item files |
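
For example, zstd-compressed output flushed periodically (values illustrative):

[storage]
path = "data/example"
flush_policy = "periodic"
max_pending_items = 500
target_max_file_mb = 100.0
compression = "zstd"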

[fetch] Section

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| type | "httpx" or "pydoll" | "httpx" | Fetcher type |

[fetch.headers]

HTTP headers to send with requests:

[fetch.headers]
User-Agent = "MyBot/1.0"
Accept = "text/html"
Authorization = "Bearer token"

[fetch.browser] (pydoll only)

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| headless | bool | true | Run without a GUI |
| page_load_timeout | float > 0 | 30.0 | Page load timeout (seconds) |
| wait_for_network_idle | bool | false | Wait for the network to settle |
| network_idle_time | float >= 0 | 2.0 | Network idle wait time (seconds) |
| wait_for_selector | string | null | CSS selector to wait for |
| selector_timeout | float > 0 | 10.0 | Selector wait timeout (seconds) |
| wait_after_load | float >= 0 | 0.0 | Additional delay after load (seconds) |
| user_agent | string | null | Custom user agent |
| viewport_width | int >= 1 | 1920 | Browser viewport width (pixels) |
| viewport_height | int >= 1 | 1080 | Browser viewport height (pixels) |
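
For example, a browser-based fetch that waits for listing cards to render before extracting (selector illustrative):

[fetch]
type = "pydoll"

[fetch.browser]
headless = true
wait_for_selector = ".listing-card"
selector_timeout = 15.0
wait_after_load = 1.0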

Complete Example

name = "complete-example"
start_urls = ["https://example.com/listings"]
extends = "base.toml"
parsers = ["custom_parsers"]

[extract]
type = "html"
items_from = "item"
base_url = "https://example.com"

[extract.items]
selector = ""
id = "property_id"

[extract.items.fields]
title = ".title"
price = { selector = ".price", parser = "parse_price", required = true }
address = ".address"
description = { selector = ".desc", parser = "squish" }
bedrooms = { selector = ".beds", parser = "parse_int" }
images = { selector = ".gallery img", attribute = "src", multiple = true }
details = { selector = ".details li", keys = "strong", values = "span" }
date_listed = { selector = "script[type='application/ld+json']", parser = "ldjson:datePosted" }

[extract.links]
pagination = [".pagination a.next"]
items = [".listing-card a"]
attribute = "href"

[extract.derived]
property_id = "details.Property ID"
year_built = { path = "details.Year Built", parser = "parse_int" }
lot_size = { path = "details.Lot Size", remove_source = false }

[policy]
max_retries = 3
retry_delay = 1.0
backoff_factor = 2.0
max_retry_delay = 60.0
concurrency = 5
delay = 1.0
jitter = 0.2
max_requests = 5000
max_consecutive_failures = 50
max_error_rate = 0.5
stop_on_empty = true
stop_on_caught_up = false

[storage]
path = "data/example"
compression = "snappy"

[hooks]
on_failure = "python scripts/recover.py {name}"
on_complete = "echo Done: {name} {items} items"
max_hook_retries = 3
hook_timeout = 300.0

[fetch]
type = "httpx"

[fetch.headers]
User-Agent = "ExampleBot/1.0"
Accept = "text/html"