Skip to content

Configuration Guide

Databrew uses TOML configuration files to define extraction rules. This guide covers all configuration options.

Basic Structure

A config file has these main sections:

# Required: Site identity
name = "mysite"
start_urls = ["https://example.com"]

# Optional: Inherit from base config
# extends = "base.toml"

# Required: Extraction rules
[extract]
type = "html"  # or "json"

# Item extraction
[extract.items]
# ...

# Link discovery
[extract.links]
# ...

# Optional: Crawl behavior
[policy]
# ...

# Optional: Output location
[storage]
# ...

# Optional: HTTP/browser settings
[fetch]
# ...

Top-Level Settings

name

Site identifier used for logging and default output paths.

name = "mysite"

start_urls

URLs to begin crawling from. Can be a list or loaded from a file.

# List of URLs
start_urls = [
    "https://example.com/page1",
    "https://example.com/page2",
]

# Load from file (one URL per line)
start_urls = { file = "urls.txt" }

start_urls.file is resolved relative to the config file location.

File format:

# Comments are ignored
https://example.com/category/1
https://example.com/category/2

# Blank lines are ignored
https://example.com/category/3

extends

Inherit from a base config file. Path is relative to the config file.

extends = "base.toml"

See Config Composition for details.

parsers

Custom parser modules to load (Python files in the same directory as the config).

parsers = ["my_parsers"]  # Loads my_parsers.py

See Custom Parsers for details.

[extract] Section

type

Extraction type: html for web pages, json for APIs.

[extract]
type = "html"

items_from

Which URL types to extract items from. Default is "item".

[extract]
items_from = "item"        # Only from item/detail pages (default)
# items_from = "pagination" # Only from listing pages
# items_from = "all"        # From all pages

base_url (HTML only)

Base URL for resolving relative links.

[extract]
type = "html"
base_url = "https://example.com"

[extract.items] Section

selector (HTML)

CSS selector for item containers. Empty string means the whole page is one item.

[extract.items]
selector = ".product-card"  # Multiple items per page
# selector = ""             # Whole page is one item (detail pages)

path (JSON)

JSON path to items array. Empty string means the full response.

[extract.items]
path = "data.products"  # Array at data.products
# path = ""             # Full response is the item

id

Field to use as unique identifier for deduplication and export filenames.

[extract.items]
id = "product_id"  # Name of an extracted field

# For nested data (JSON or derived fields)
id = "details.Property ID"

fields

Fields to extract from each item.

[extract.items.fields]
# Simple selector (HTML) or path (JSON)
title = "h2.title"

# Full config
price = { selector = ".price", parser = "parse_price", required = true }
image = { selector = "img", attribute = "src" }
tags = { selector = ".tag", multiple = true }

See HTML Extraction and JSON Extraction for field options.

[extract.links] Section

pagination (HTML/JSON)

Selectors/paths for pagination links (always followed).

[extract.links]
# HTML: CSS selectors
pagination = ["a.next-page", ".pagination a"]

# JSON: dot-notation paths
pagination = ["links.next", "meta.next_page_url"]

items (HTML)

CSS selectors for item detail page links.

[extract.links]
items = [".product-card a", ".listing-link"]

Item URL Construction (JSON)

For JSON APIs, construct item URLs from IDs:

[extract.links]
items_path = "data.products"  # Path to items array
items_id = "id"               # ID field in each item
items_url = "https://api.example.com/products/{id}"  # URL template

attribute (HTML)

Attribute containing the URL (default: href).

[extract.links]
attribute = "href"

[extract.derived] Section

Extract fields from already-extracted nested data.

[extract.derived]
# Simple path
property_id = "details.Property ID"

# With parser
bedrooms = { path = "details.Bedrooms", parser = "parse_int" }

# Keep source field (default removes it)
status = { path = "info.status", remove_source = false }

See HTML Extraction for details.

[policy] Section

Controls crawl behavior, retries, and stopping conditions.

Retry Settings

[policy]
max_retries = 3           # Retry attempts per URL (default: 3)
retry_delay = 1.0         # Initial retry delay in seconds (default: 1.0)
backoff_factor = 2.0      # Multiply delay after each retry (default: 2.0)
max_retry_delay = 60.0    # Maximum retry delay (default: 60.0)

Concurrency and Pacing

[policy]
concurrency = 5           # Parallel requests (default: 5)
delay = 1.0               # Delay after each batch in seconds (default: 0.0)
jitter = 0.2              # Random delay 0-N seconds before each request (default: 0.1)

Stopping Conditions

[policy]
max_requests = 1000       # Stop after N requests (default: unlimited)
max_consecutive_failures = 50  # Stop after N consecutive failures (default: 50)
max_error_rate = 0.5      # Stop if error rate exceeds 50% (default: 0.5)
min_requests_for_error_rate = 20  # Min requests before error rate check (default: 20)

Incremental Crawling

[policy]
stop_on_empty = true      # Stop branch when page yields no items/links (default: true)
stop_on_caught_up = false # Global stop on caught-up (default: false)
caught_up_threshold = 3   # Pages before global stop (default: 3)

[storage] Section

path

Output directory for .state.db, .index.db, and items/*.parquet. Relative to CWD (current working directory).

[storage]
path = "data/mysite"  # Creates ./data/mysite/.state.db, ./data/mysite/.index.db, and ./data/mysite/items/

Note

The path is relative to where you run databrew, not the config file location.

target_max_file_mb

Target maximum size (in MB) for each Parquet part file written during crawling.

[storage]
target_max_file_mb = 50  # Git-safe default

Databrew will split writes into more part files when needed to stay near this cap.

max_pending_items

Number of buffered items before Databrew flushes a new Parquet part.

[storage]
max_pending_items = 1000

Increase this to reduce tiny-file creation during long initial crawls.

flush_policy

When buffered items are flushed to Parquet.

[storage]
flush_policy = "finalize"  # or "periodic"
  • finalize (default): writes buffered items at shutdown and controlled flush points.
  • periodic: continuously flushes based on max_pending_items.

ephemeral_index

Delete .index.db on clean close. Useful when you want a fully rebuildable local index.

[storage]
ephemeral_index = false

state_db_name / index_db_name / items_dir_name

Optional local naming overrides inside [storage].path.

[storage]
state_db_name = ".state.db"
index_db_name = ".index.db"
items_dir_name = "items"

compression

Parquet compression codec. Allowed values: snappy, zstd, gzip, none.

[storage]
compression = "snappy"

Invalid values are rejected during config validation (for example, snapy).

[hooks] Section

Lifecycle hooks run shell commands at key points during a crawl. Commands support template variables: {name}, {failures}, {items}, {requests}.

[hooks]
on_start = "echo Starting {name}"
on_failure = "python scripts/recover.py {name}"
on_complete = "python scripts/notify.py {name} {items}"
max_hook_retries = 3       # Max times on_failure can fire (default: 3)
hook_timeout = 300.0       # Timeout per hook in seconds (default: 300.0)

The on_failure hook is the most useful: when the crawl hits max_consecutive_failures, it runs your recovery script. If the script exits 0, the crawl resets its failure counter, reloads config, and resumes.

CLI flags (--on-start, --on-failure, --on-complete) override config values.

See Lifecycle Hooks for full details and examples.

[fetch] Section

type

Fetcher type: httpx (HTTP client) or pydoll (browser).

[fetch]
type = "httpx"  # Default
# type = "pydoll"  # Browser-based fetching

headers

HTTP headers to send with requests.

[fetch.headers]
User-Agent = "MyBot/1.0"
Accept = "text/html"
Authorization = "Bearer token123"

[fetch.browser] (pydoll only)

Browser-specific settings when using type = "pydoll".

[fetch]
type = "pydoll"

[fetch.browser]
headless = true                    # Run without GUI (default: true)
page_load_timeout = 30.0           # Timeout in seconds (default: 30.0)
wait_for_selector = ".content"     # Wait for element before extraction
selector_timeout = 10.0            # Selector wait timeout (default: 10.0)
wait_for_network_idle = false      # Wait for network to settle (default: false)
network_idle_time = 2.0            # Network idle wait time (default: 2.0)
wait_after_load = 0.0              # Additional delay after load (default: 0.0)
user_agent = "Mozilla/5.0 ..."     # Custom user agent
viewport_width = 1920              # Browser viewport width (default: 1920)
viewport_height = 1080             # Browser viewport height (default: 1080)

See Browser Fetching for details.

Config Composition

Configs can inherit from a base config using extends:

# base.toml
[fetch.headers]
User-Agent = "MyBot/1.0"

[policy]
max_retries = 3
concurrency = 5
delay = 1.0

# mysite.toml
extends = "base.toml"
name = "mysite"
start_urls = ["https://example.com"]

[extract]
type = "html"
# ... site-specific rules

Merge Behavior

  • Dicts: Merge recursively (child values override base)
  • Lists: Replace entirely (no concatenation)
  • Scalars: Replace entirely

Chained Inheritance

# base.toml → common.toml → mysite.toml
extends = "common.toml"  # common.toml can also extend base.toml

Complete Example

name = "realestate"
start_urls = ["https://example.com/listings"]

[extract]
type = "html"
base_url = "https://example.com"

[extract.items]
selector = ""  # Detail pages
id = "property_id"

[extract.items.fields]
title = ".listing-title"
price = { selector = ".price", parser = "parse_price", required = true }
address = ".address"
bedrooms = { selector = ".beds", parser = "parse_int" }
bathrooms = { selector = ".baths", parser = "parse_int" }
sqft = { selector = ".sqft", parser = "parse_int" }
description = ".description"
images = { selector = ".gallery img", attribute = "src", multiple = true }
details = { selector = ".details li", keys = "strong", values = "span" }
coordinates = { selector = "#map", attribute = "data-coords", parser = "parse_coordinates" }

[extract.links]
pagination = [".pagination a.next"]
items = [".listing-card a"]

[extract.derived]
property_id = "details.Property ID"
year_built = { path = "details.Year Built", parser = "parse_int" }
lot_size = "details.Lot Size"

[policy]
concurrency = 3
delay = 1.0
jitter = 0.2
max_retries = 3
max_requests = 5000

[storage]
path = "data/realestate"

[fetch.headers]
User-Agent = "RealEstateBot/1.0"

# [hooks]
# on_failure = "python scripts/recover.py {name}"
# on_complete = "python scripts/notify.py {name} {items}"
# max_hook_retries = 3
# hook_timeout = 300.0