Configuration Reference
This is a complete reference for all TOML configuration options.
Top-Level Settings
| Key | Type | Required | Default | Description |
|-----|------|----------|---------|-------------|
| name | string | Yes | - | Site identifier |
| start_urls | list or table | Yes | - | Starting URLs |
| extends | string | No | - | Base config to inherit from |
| parsers | list | No | [] | Custom parser modules to load |
start_urls
# List of URLs
start_urls = ["https://example.com/page1", "https://example.com/page2"]
# Load from file
start_urls = { file = "urls.txt" }
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| type | "html" \| "json" | "html" | Extraction type |
| items_from | "item" \| "pagination" \| "all" | "item" | Which URL types to save items from |
| base_url | string | null | Base URL for relative links (HTML only) |
HTML Items Config
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| selector | string | "" | CSS selector for item containers |
| id | string | null | Field name or path for deduplication |
| fields | table | {} | Field extraction definitions |
JSON Items Config
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| path | string | null | Dot-notation path to items array |
| id | string | null | Path to unique identifier |
| fields | table | null | Field definitions (null = export full item) |
HTML Field Config
[extract.items.fields]
# Simple selector (text content)
title = "h1.title"
# Full config
# Full config (multi-line inline tables are invalid in TOML 1.0,
# so the full form uses a sub-table)
[extract.items.fields.field]
selector = "string"   # CSS selector (required unless using keys/values)
attribute = "string"  # Attribute to extract (null = text content)
parser = "string"     # Parser function name
required = false      # Fail item if missing
multiple = false      # Extract all matches as list
keys = "string"       # CSS selector for keys (key-value extraction)
values = "string"     # CSS selector for values (key-value extraction)
units = "string"      # CSS selector for units (appended to values)
JSON Field Config
[extract.items.fields]
# Simple path
title = "data.title"
# Full config
field = {
path = "string", # Dot-notation path (required)
parser = "string", # Parser function name
required = false, # Fail item if missing
}
HTML Links Config
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| pagination | list | [] | CSS selectors for pagination links |
| items | list | [] | CSS selectors for item detail links |
| attribute | string | "href" | Attribute containing the URL |
| base_url | string | null | Base URL for relative links |
JSON Links Config
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| pagination | list | [] | Dot-notation paths to pagination URLs |
| items_path | string | null | Path to items array for URL construction |
| items_id | string | null | ID field within each item |
| items_url | string | null | URL template with {id} placeholder |
Derived fields extract values from already-extracted nested data.
[extract.derived]
# Simple path
property_id = "details.Property ID"
# Full config
# Full config (multi-line inline tables are invalid in TOML 1.0,
# so the full form uses a sub-table)
[extract.derived.field]
path = "string"        # Dot-notation path (required)
parser = "string"      # Parser function name
remove_source = true   # Remove key from source dict
[policy] Section
Retry Settings
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| max_retries | int >= 0 | 3 | Maximum retry attempts per URL |
| retry_delay | float > 0 | 1.0 | Initial retry delay (seconds) |
| backoff_factor | float >= 1 | 2.0 | Delay multiplier after each retry |
| max_retry_delay | float > 0 | 60.0 | Maximum retry delay (seconds) |
Concurrency and Pacing
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| concurrency | int >= 1 | 5 | Parallel requests |
| delay | float >= 0 | 0.0 | Delay after each batch (seconds) |
| jitter | float >= 0 | 0.1 | Random delay before each request (seconds) |
Stopping Conditions
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| max_requests | int >= 1 | null | Maximum requests (retries don't count) |
| max_consecutive_failures | int >= 1 | 10 | Stop after N consecutive failures |
| max_error_rate | float 0-1 | 0.5 | Stop if error rate exceeds this |
| min_requests_for_error_rate | int >= 1 | 20 | Min requests before error rate check |
Incremental Settings
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| stop_on_empty | bool | true | Stop branch on empty page |
| stop_on_caught_up | bool | false | Global stop on caught-up |
| caught_up_threshold | int >= 1 | 3 | Consecutive caught-up pages for global stop |
Checkpointing
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| checkpoint_every | int >= 1 | 100 | Checkpoint after N items |
[storage] Section
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| path | string | "data/{name}" | Output directory (relative to CWD) |
[fetch] Section
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| type | "httpx" \| "pydoll" | "httpx" | Fetcher type |
HTTP headers to send with requests:
[fetch.headers]
User-Agent = "MyBot/1.0"
Accept = "text/html"
Authorization = "Bearer token"
[fetch.browser] (pydoll only)
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| headless | bool | true | Run without GUI |
| page_load_timeout | float > 0 | 30.0 | Page load timeout (seconds) |
| wait_for_network_idle | bool | false | Wait for network to settle |
| network_idle_time | float >= 0 | 2.0 | Network idle wait time (seconds) |
| wait_for_selector | string | null | CSS selector to wait for |
| selector_timeout | float > 0 | 10.0 | Selector wait timeout (seconds) |
| wait_after_load | float >= 0 | 0.0 | Additional delay after load (seconds) |
| user_agent | string | null | Custom user agent |
| viewport_width | int >= 1 | 1920 | Browser viewport width |
| viewport_height | int >= 1 | 1080 | Browser viewport height |
Complete Example
name = "complete-example"
start_urls = ["https://example.com/listings"]
extends = "base.toml"
parsers = ["custom_parsers"]
[extract]
type = "html"
items_from = "item"
base_url = "https://example.com"
[extract.items]
selector = ""
id = "property_id"
[extract.items.fields]
title = ".title"
price = { selector = ".price", parser = "parse_price", required = true }
address = ".address"
description = { selector = ".desc", parser = "squish" }
bedrooms = { selector = ".beds", parser = "parse_int" }
images = { selector = ".gallery img", attribute = "src", multiple = true }
details = { selector = ".details li", keys = "strong", values = "span" }
date_listed = { selector = "script[type='application/ld+json']", parser = "ldjson:datePosted" }
[extract.links]
pagination = [".pagination a.next"]
items = [".listing-card a"]
attribute = "href"
[extract.derived]
property_id = "details.Property ID"
year_built = { path = "details.Year Built", parser = "parse_int" }
lot_size = { path = "details.Lot Size", remove_source = false }
[policy]
max_retries = 3
retry_delay = 1.0
backoff_factor = 2.0
max_retry_delay = 60.0
concurrency = 5
delay = 1.0
jitter = 0.2
max_requests = 5000
max_consecutive_failures = 10
max_error_rate = 0.5
stop_on_empty = true
stop_on_caught_up = false
checkpoint_every = 100
[storage]
path = "data/example"
[fetch]
type = "httpx"
[fetch.headers]
User-Agent = "ExampleBot/1.0"
Accept = "text/html"