Custom Parsers¶
Parsers transform extracted values. You can create custom parsers for site-specific transformations.
Creating a Parser¶
A parser is a function that takes a string and returns the transformed value:
Registering Parsers¶
Runtime Registration¶
Register parsers at runtime with register_parser:
from databrew import register_parser
def parse_phone(text: str) -> str:
    """Reduce a phone number to its digits and ``+`` signs.

    Every character that is neither a digit nor ``+`` is dropped.
    """
    import re

    unwanted = re.compile(r"[^\d+]")
    return unwanted.sub("", text)
# Register the function under the name "parse_phone".
register_parser("parse_phone", parse_phone)
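Then reference the parser by its registered name in the config, for example:
[extract.items.fields]
phone = { selector = ".phone", parser = "parse_phone" }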
Module-Based Registration¶
Create a Python file with parsers and load it via config:
# my_parsers.py
from databrew import register_parser
def parse_ethiopian_date(text: str) -> str:
"""Convert Ethiopian date to Gregorian."""
# ... conversion logic ...
# NOTE(review): placeholder body; the elided logic must assign
# gregorian_date, which is not defined anywhere in this example.
return gregorian_date
def parse_amharic_number(text: str) -> int:
"""Parse Amharic numerals."""
# ... parsing logic ...
# NOTE(review): placeholder body; the elided logic must assign
# number, which is not defined anywhere in this example.
return number
# Auto-register when module loads: these top-level calls run on import and
# bind each function to the name given as the first argument.
register_parser("parse_ethiopian_date", parse_ethiopian_date)
register_parser("parse_amharic_number", parse_amharic_number)
Reference in config:
parsers = ["my_parsers"] # Loads my_parsers.py from config directory
[extract.items.fields]
date = { selector = ".date", parser = "parse_ethiopian_date" }
count = { selector = ".count", parser = "parse_amharic_number" }
Parser Return Types¶
Parsers can return any JSON-serializable type:
# String
def uppercase(text: str) -> str:
    """Return ``text`` converted to upper case."""
    shouted = text.upper()
    return shouted
# Integer
def to_int(text: str) -> int | None:
try:
return int(text.replace(",", ""))
except ValueError:
return None
# Float
def to_float(text: str) -> float | None:
try:
return float(text.replace(",", ""))
except ValueError:
return None
# Dict
def parse_dimensions(text: str) -> dict:
    """Split a ``WxHxD unit`` string into numeric dimensions and a unit.

    Example: "10x20x30 cm" -> {"width": 10, "height": 20, "depth": 30, "unit": "cm"}
    """
    segments = text.split("x")
    width = int(segments[0])
    height = int(segments[1])
    # The last segment carries both the depth and the unit, e.g. "30 cm".
    depth_and_unit = segments[2].split()
    depth = int(depth_and_unit[0])
    unit = depth_and_unit[1]
    return {"width": width, "height": height, "depth": depth, "unit": unit}
# List
def split_tags(text: str) -> list:
    """Split comma-separated text into a list of whitespace-trimmed tags."""
    raw_tags = text.split(",")
    return list(map(str.strip, raw_tags))
Element-Aware Parsers¶
For complex extraction, parsers can receive the BeautifulSoup element instead of text:
from bs4 import Tag
from databrew import register_parser
def parse_table(element: Tag) -> dict:
    """Build a key/value mapping from the rows of a table element.

    For every ``tr`` with at least two ``td`` cells, the stripped text of the
    first cell is the key and the stripped text of the second is the value.
    Rows with fewer than two cells are ignored.
    """
    pairs = {}
    for tr in element.select("tr"):
        tds = tr.select("td")
        if len(tds) < 2:
            continue
        label = tds[0].get_text(strip=True)
        content = tds[1].get_text(strip=True)
        pairs[label] = content
    return pairs
# Mark as element-aware (receives Tag, not text) by setting the
# _element_parser attribute on the function, then register it by name.
parse_table._element_parser = True
register_parser("parse_table", parse_table)
Or use the decorator:
from databrew.extract.html import element_parser
@element_parser
def parse_table(element: Tag) -> dict:
# ... (function body elided in this example)
# NOTE(review): presumably the decorator replaces the manual
# `_element_parser = True` assignment - confirm against databrew.extract.html.
Or use type annotations (auto-detected):
Error Handling¶
Parsers should handle errors gracefully:
def safe_parse_int(text: str) -> int | None:
"""Parse integer, returning None on failure."""
if not text:
return None
try:
return int(text.replace(",", "").strip())
except (ValueError, AttributeError):
return None
If a parser raises an exception:
- The field value becomes None
- A warning is logged (visible with -v)
- The item is not failed (unless the field is required)
Using with Derived Fields¶
Parsers work with derived fields too:
[extract.items.fields]
details = { selector = ".details li", keys = "strong", values = "span" }
[extract.derived]
bedrooms = { path = "details.Bedrooms", parser = "parse_int" }
price = { path = "details.Price", parser = "parse_price" }
Common Parser Patterns¶
Clean and Normalize¶
import re
def clean_text(text: str) -> str:
    """Collapse each run of whitespace to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
Extract with Regex¶
import re
def extract_id(text: str) -> str | None:
"""Extract ID from text like 'ID: 12345'."""
match = re.search(r"ID:\s*(\d+)", text)
return match.group(1) if match else None
Parse Structured Text¶
def parse_address(text: str) -> dict:
    """Break a comma-separated address into street, city and state/zip.

    Example input: "123 Main St, City, State 12345". Components missing from
    the text are reported as ``None``; anything after the third is ignored.
    """
    pieces = [piece.strip() for piece in text.split(",")]
    # Pad to three entries so absent components become None.
    padded = (pieces + [None, None, None])[:3]
    return dict(zip(("street", "city", "state_zip"), padded))
Convert Units¶
def sqft_to_sqm(text: str) -> float | None:
"""Convert square feet to square meters."""
import re
match = re.search(r"[\d,]+", text)
if match:
sqft = float(match.group().replace(",", ""))
return round(sqft * 0.092903, 2)
return None
Handle Multiple Formats¶
def parse_date(text: str) -> str | None:
"""Parse various date formats to ISO."""
from datetime import datetime
formats = [
"%Y-%m-%d",
"%d/%m/%Y",
"%m/%d/%Y",
"%B %d, %Y",
"%d %B %Y",
]
for fmt in formats:
try:
dt = datetime.strptime(text.strip(), fmt)
return dt.isoformat()
except ValueError:
continue
return None
Complete Example¶
# site_parsers.py
import re
from bs4 import Tag
from databrew import register_parser
def parse_birr(text: str) -> dict | None:
"""Parse Ethiopian Birr price."""
if not text:
return None
# Remove "ETB", "Birr", commas
cleaned = re.sub(r"[ETB|Birr|,\s]", "", text, flags=re.IGNORECASE)
try:
amount = float(cleaned)
return {
"amount": amount,
"currency": "ETB",
"raw": text,
}
except ValueError:
return None
def parse_sqm(text: str) -> float | None:
"""Parse square meters from text."""
match = re.search(r"[\d,]+\.?\d*", text)
if match:
return float(match.group().replace(",", ""))
return None
def parse_features_list(element: Tag) -> list:
    """Collect the stripped text of every ``li`` found under ``element``."""
    features = []
    for li in element.select("li"):
        features.append(li.get_text(strip=True))
    return features
# Set the _element_parser attribute so parse_features_list is treated as
# element-aware (it takes a Tag rather than text).
parse_features_list._element_parser = True
# Register all parsers
register_parser("parse_birr", parse_birr)
register_parser("parse_sqm", parse_sqm)
register_parser("parse_features_list", parse_features_list)
Config: